from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
# Utilities
from time import time
from PIL import Image
from zipfile import ZipFile
import os, sys, itertools, re
import warnings, pickle, string
!pip install ftfy
from ftfy import fix_encoding, fix_text, badness
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# Translation APIs
!pip install goslate
from goslate import Goslate # Provided by Google
# Numerical calculation
import numpy as np
# Data Handling
import pandas as pd
# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import cufflinks as cf
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
# Sequential Modeling
import keras.backend as K
from keras.datasets import imdb
from keras.models import Sequential, Model
from keras.layers.merge import Concatenate
from keras.layers import Input, Dropout, Flatten, Dense, Embedding, LSTM, GRU
from keras.layers import BatchNormalization, TimeDistributed, Conv1D, MaxPooling1D
from keras.constraints import max_norm, unit_norm
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, ModelCheckpoint
# Traditional Modeling
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Tools & Evaluation metrics
from sklearn.metrics import confusion_matrix, classification_report, auc
from sklearn.metrics import roc_curve, accuracy_score, precision_recall_curve
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
# NLP toolkits
import spacy
import nltk
from nltk import tokenize
# Configure for any default setting of any library
warnings.filterwarnings('ignore')
get_ipython().magic(u'matplotlib inline')
plt.style.use('ggplot')
init_notebook_mode(connected=True)
cf.go_offline()
%matplotlib inline
Collecting ftfy
Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
|████████████████████████████████| 53 kB 1.6 MB/s
Requirement already satisfied: wcwidth>=0.2.5 in /usr/local/lib/python3.7/dist-packages (from ftfy) (0.2.5)
Installing collected packages: ftfy
Successfully installed ftfy-6.1.1
Collecting goslate
Downloading goslate-1.5.2.tar.gz (16 kB)
Collecting futures
Downloading futures-3.0.5.tar.gz (25 kB)
WARNING: Discarding https://files.pythonhosted.org/packages/55/db/97c1ca37edab586a1ae03d6892b6633d8eaa23b23ac40c7e5bbc55423c78/futures-3.0.5.tar.gz#sha256=0542525145d5afc984c88f914a0c85c77527f65946617edb5274f72406f981df (from https://pypi.org/simple/futures/). Command errored out with exit status 1: python setup.py egg_info Check the logs for full command output.
Downloading futures-3.0.4.tar.gz (25 kB)
WARNING: Discarding https://files.pythonhosted.org/packages/8d/73/b5fff618482bc06c9711e7cdc0d5d7eb1904d35898f48f2d7f9696b08bef/futures-3.0.4.tar.gz#sha256=19485d83f7bd2151c0aeaf88fbba3ee50dadfb222ffc3b66a344ef4952b782a3 (from https://pypi.org/simple/futures/). Command errored out with exit status 1: python setup.py egg_info Check the logs for full command output.
Downloading futures-3.0.3.tar.gz (24 kB)
WARNING: Discarding https://files.pythonhosted.org/packages/4c/dc/f9473006d4c9c52d4a4e977173fbcbfb1a8ef3a57e32e885edf994fd4a45/futures-3.0.3.tar.gz#sha256=2fe2342bb4fe8b8e217f0d21b5921cbe5408bf966d9f92025e707e881b198bed (from https://pypi.org/simple/futures/). Command errored out with exit status 1: python setup.py egg_info Check the logs for full command output.
Downloading futures-3.0.2.tar.gz (24 kB)
WARNING: Discarding https://files.pythonhosted.org/packages/f8/e7/fc0fcbeb9193ba2d4de00b065e7fd5aecd0679e93ce95a07322b2b1434f4/futures-3.0.2.tar.gz#sha256=dc3fc91508e49e0fd2f8625f0132d16e49c80f882e7e1d565c56b0d5dfbae257 (from https://pypi.org/simple/futures/). Command errored out with exit status 1: python setup.py egg_info Check the logs for full command output.
Downloading futures-3.0.1.tar.gz (24 kB)
WARNING: Discarding https://files.pythonhosted.org/packages/b2/2c/6b6a57379e47031c6f52e625e0e2b8f6702a8d1f61b6e0daee391e82c187/futures-3.0.1.tar.gz#sha256=f78f2ef458639d72a625cf9c7643cf5442bb222ac11c12bcc445c6ad1cd862e2 (from https://pypi.org/simple/futures/). Command errored out with exit status 1: python setup.py egg_info Check the logs for full command output.
Downloading futures-3.0.0.tar.gz (24 kB)
WARNING: Discarding https://files.pythonhosted.org/packages/ea/c9/35287369718fc05059e7a9d0d73c53745fe981010b4185b3858e7d46eff1/futures-3.0.0.tar.gz#sha256=d9cd7bb09aa01f0e4940af64c31fbd7045098b7b4354420d7838ea39e8b86ee3 (from https://pypi.org/simple/futures/). Command errored out with exit status 1: python setup.py egg_info Check the logs for full command output.
Downloading futures-2.2.0-py2.py3-none-any.whl (16 kB)
Building wheels for collected packages: goslate
Building wheel for goslate (setup.py) ... done
Created wheel for goslate: filename=goslate-1.5.2-py3-none-any.whl size=11436 sha256=6686c7d37ecb7d4e0821f1850221cb808d5f864ea822eeb878a9d5781716ff75
Stored in directory: /root/.cache/pip/wheels/a8/8a/c4/85425eac5e0746fd5fc898801858331e55ac386f476d65e58d
Successfully built goslate
Installing collected packages: futures, goslate
Successfully installed futures-2.2.0 goslate-1.5.2
if 'google.colab' in sys.modules:
project_path = "/content/drive/My Drive/Colab Notebooks/"
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)
sys.path.append(project_path)
%cd $project_path
print('Current working directory', os.getcwd())
Mounted at /content/drive/ /content/drive/My Drive/Colab Notebooks Current working directory /content/drive/My Drive/Colab Notebooks
df = pd.read_excel('/content/drive/MyDrive/Dataset.xlsx', )
df.head()
| Short description | Description | Caller | Assignment group | |
|---|---|---|---|---|
| 0 | login issue | -verified user details.(employee# & manager na... | spxjnwir pjlcoqds | GRP_0 |
| 1 | outlook | _x000D_\n_x000D_\nreceived from: hmjdrvpb.komu... | hmjdrvpb komuaywn | GRP_0 |
| 2 | cant log in to vpn | _x000D_\n_x000D_\nreceived from: eylqgodm.ybqk... | eylqgodm ybqkwiam | GRP_0 |
| 3 | unable to access hr_tool page | unable to access hr_tool page | xbkucsvz gcpydteq | GRP_0 |
| 4 | skype error | skype error | owlgqjme qhcozdfx | GRP_0 |
df.tail()
| Short description | Description | Caller | Assignment group | |
|---|---|---|---|---|
| 8495 | emails not coming in from zz mail | _x000D_\n_x000D_\nreceived from: avglmrts.vhqm... | avglmrts vhqmtiua | GRP_29 |
| 8496 | telephony_software issue | telephony_software issue | rbozivdq gmlhrtvp | GRP_0 |
| 8497 | vip2: windows password reset for tifpdchb pedx... | vip2: windows password reset for tifpdchb pedx... | oybwdsgx oxyhwrfz | GRP_0 |
| 8498 | machine não está funcionando | i am unable to access the machine utilities to... | ufawcgob aowhxjky | GRP_62 |
| 8499 | an mehreren pc`s lassen sich verschiedene prgr... | an mehreren pc`s lassen sich verschiedene prgr... | kqvbrspl jyzoklfx | GRP_49 |
print('No of rows:\033[1m', df.shape[0], '\033[0m')
print('No of cols:\033[1m', df.shape[1], '\033[0m')
No of rows: 8500 No of cols: 4
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8500 entries, 0 to 8499 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Short description 8492 non-null object 1 Description 8499 non-null object 2 Caller 8500 non-null object 3 Assignment group 8500 non-null object dtypes: object(4) memory usage: 265.8+ KB
df.describe()
| Short description | Description | Caller | Assignment group | |
|---|---|---|---|---|
| count | 8492 | 8499 | 8500 | 8500 |
| unique | 7481 | 7817 | 2950 | 74 |
| top | password reset | the | bpctwhsn kzqsbmtp | GRP_0 |
| freq | 38 | 56 | 810 | 3976 |
df[df.Description == 'the'].head()
| Short description | Description | Caller | Assignment group | |
|---|---|---|---|---|
| 1049 | reset passwords for soldfnbq uhnbsvqd using pa... | the | soldfnbq uhnbsvqd | GRP_17 |
| 1054 | reset passwords for fygrwuna gomcekzi using pa... | the | fygrwuna gomcekzi | GRP_17 |
| 1144 | reset passwords for wvdxnkhf jirecvta using pa... | the | wvdxnkhf jirecvta | GRP_17 |
| 1184 | reset passwords for pxvjczdt kizsjfpq using pa... | the | pxvjczdt kizsjfpq | GRP_17 |
| 1292 | reset passwords for cubdsrml znewqgop using pa... | the | cubdsrml znewqgop | GRP_17 |
df.isnull().sum()
Short description 8 Description 1 Caller 0 Assignment group 0 dtype: int64
df[pd.isnull(df).any(axis=1)]
| Short description | Description | Caller | Assignment group | |
|---|---|---|---|---|
| 2604 | NaN | _x000D_\n_x000D_\nreceived from: ohdrnswl.rezu... | ohdrnswl rezuibdt | GRP_34 |
| 3383 | NaN | _x000D_\n-connected to the user system using t... | qftpazns fxpnytmk | GRP_0 |
| 3906 | NaN | -user unable tologin to vpn._x000D_\n-connect... | awpcmsey ctdiuqwe | GRP_0 |
| 3910 | NaN | -user unable tologin to vpn._x000D_\n-connect... | rhwsmefo tvphyura | GRP_0 |
| 3915 | NaN | -user unable tologin to vpn._x000D_\n-connect... | hxripljo efzounig | GRP_0 |
| 3921 | NaN | -user unable tologin to vpn._x000D_\n-connect... | cziadygo veiosxby | GRP_0 |
| 3924 | NaN | name:wvqgbdhm fwchqjor\nlanguage:\nbrowser:mic... | wvqgbdhm fwchqjor | GRP_0 |
| 4341 | NaN | _x000D_\n_x000D_\nreceived from: eqmuniov.ehxk... | eqmuniov ehxkcbgj | GRP_0 |
| 4395 | i am locked out of skype | NaN | viyglzfo ajtfzpkb | GRP_0 |
df.isnull().sum()
Short description 8 Description 1 Caller 0 Assignment group 0 dtype: int64
df_copy=df.astype(str)
duplicateRows_df=df_copy[df_copy.duplicated()]
print(duplicateRows_df)
print(duplicateRows_df.sum())
Short description \
51 call for ecwtrjnq jpecxuty
229 call for ecwtrjnq jpecxuty
493 ticket update on inplant_872730
512 blank call //gso
667 job bkbackup_tool_powder_prod_full failed in j...
... ...
7836 probleme mit erpgui \tmqfjard qzhgdoua
8051 issue on pricing in distributor_tool
8093 reset passwords for prgthyuulla ramdntythanjes...
8347 blank call // loud noise
8405 unable to launch outlook
Description Caller \
51 call for ecwtrjnq jpecxuty olckhmvx pcqobjnd
229 call for ecwtrjnq jpecxuty olckhmvx pcqobjnd
493 ticket update on inplant_872730 fumkcsji sarmtlhy
512 blank call //gso rbozivdq gmlhrtvp
667 received from: monitoring_tool@company.com_x00... bpctwhsn kzqsbmtp
... ... ...
7836 probleme mit erpgui \tmqfjard qzhgdoua tmqfjard qzhgdoua
8051 we have agreed price with many of the distribu... hbmwlprq ilfvyodx
8093 the boirqctx bkijgqry
8347 blank call // loud noise rbozivdq gmlhrtvp
8405 unable to launch outlook wjtzrmqc ikqpbflg
Assignment group
51 GRP_0
229 GRP_0
493 GRP_0
512 GRP_0
667 GRP_8
... ...
7836 GRP_24
8051 GRP_21
8093 GRP_17
8347 GRP_0
8405 GRP_0
[83 rows x 4 columns]
Short description call for ecwtrjnq jpecxutycall for ecwtrjnq jp...
Description call for ecwtrjnq jpecxutycall for ecwtrjnq jp...
Caller olckhmvx pcqobjndolckhmvx pcqobjndfumkcsji sar...
Assignment group GRP_0GRP_0GRP_0GRP_0GRP_8GRP_0GRP_8GRP_0GRP_0G...
dtype: object
def rm_duplicate(text):
text.drop_duplicates(['Short description', 'Description', 'Caller', 'Assignment group'],inplace=True)
return text
rm_duplicate(df_copy)
| Short description | Description | Caller | Assignment group | |
|---|---|---|---|---|
| 0 | login issue | -verified user details.(employee# & manager na... | spxjnwir pjlcoqds | GRP_0 |
| 1 | outlook | _x000D_\n_x000D_\nreceived from: hmjdrvpb.komu... | hmjdrvpb komuaywn | GRP_0 |
| 2 | cant log in to vpn | _x000D_\n_x000D_\nreceived from: eylqgodm.ybqk... | eylqgodm ybqkwiam | GRP_0 |
| 3 | unable to access hr_tool page | unable to access hr_tool page | xbkucsvz gcpydteq | GRP_0 |
| 4 | skype error | skype error | owlgqjme qhcozdfx | GRP_0 |
| ... | ... | ... | ... | ... |
| 8495 | emails not coming in from zz mail | _x000D_\n_x000D_\nreceived from: avglmrts.vhqm... | avglmrts vhqmtiua | GRP_29 |
| 8496 | telephony_software issue | telephony_software issue | rbozivdq gmlhrtvp | GRP_0 |
| 8497 | vip2: windows password reset for tifpdchb pedx... | vip2: windows password reset for tifpdchb pedx... | oybwdsgx oxyhwrfz | GRP_0 |
| 8498 | machine não está funcionando | i am unable to access the machine utilities to... | ufawcgob aowhxjky | GRP_62 |
| 8499 | an mehreren pc`s lassen sich verschiedene prgr... | an mehreren pc`s lassen sich verschiedene prgr... | kqvbrspl jyzoklfx | GRP_49 |
8417 rows × 4 columns
We can address NULL/Missing values in the dataset in a variety of ways, including:
df['Assignment group'].value_counts().plot(kind='bar', figsize=(20,10), title='Class Label Distribution')
<matplotlib.axes._subplots.AxesSubplot at 0x7fb2fd023750>
Mojibake
Mojibake is the garbled text that is the result of text being decoded using an unintended character encoding. The result is a systematic replacement of symbols with completely unrelated ones, often from a different writing system.
This display may include the generic replacement character ("�") in places where the binary representation is considered invalid. A replacement can also involve multiple consecutive symbols, as viewed in one encoding, when the same binary code constitutes one symbol in the other encoding. This is either because of differing constant length encoding (as in Asian 16-bit encodings vs European 8-bit encodings), or the use of variable length encodings (notably UTF-8 and UTF-16). Few such Mojibakes are ¶, ç, å, €, æ, œ, º, ‡, ¼, ¥ etc.
As we're dealing with Natural Language and the source of the data is unknown to us, let's run the encoding check to figure out if the dataset is Mojibake impacted.
The library ftfy (Fixes Text For You) has a greater ability to detect, fix and deal with such Mojibakes. It fixes Unicode that’s broken in various ways. The goal of ftfy is to take in bad Unicode and output good Unicode.
MOJIBAKE_CATEGORIES = {
# Characters that appear in many different contexts. Sequences that contain
# them are not inherently mojibake
"common": (
"\N{NO-BREAK SPACE}"
"\N{SOFT HYPHEN}"
"\N{MIDDLE DOT}"
"\N{ACUTE ACCENT}"
"\N{EN DASH}"
"\N{EM DASH}"
"\N{HORIZONTAL BAR}"
"\N{HORIZONTAL ELLIPSIS}"
"\N{RIGHT SINGLE QUOTATION MARK}"
),
# the C1 control character range, which have no uses outside of mojibake anymore
"c1": "\x80-\x9f",
# Characters that are nearly 100% used in mojibake
"bad": (
"\N{BROKEN BAR}"
"\N{CURRENCY SIGN}"
"\N{DIAERESIS}"
"\N{NOT SIGN}"
"\N{MACRON}"
"\N{PILCROW SIGN}"
"\N{SECTION SIGN}"
"\N{CEDILLA}"
"\N{LATIN SMALL LETTER F WITH HOOK}"
"\N{MODIFIER LETTER CIRCUMFLEX ACCENT}" # it's not a modifier
"\N{CARON}"
"\N{BREVE}"
"\N{OGONEK}"
"\N{SMALL TILDE}"
"\N{DAGGER}"
"\N{DOUBLE DAGGER}"
"\N{PER MILLE SIGN}"
"\N{REVERSED NOT SIGN}"
"\N{LOZENGE}"
"\ufffd"
# Theoretically these would appear in 'numeric' contexts, but when they
# co-occur with other mojibake characters, it's not really ambiguous
"\N{FEMININE ORDINAL INDICATOR}"
"\N{MASCULINE ORDINAL INDICATOR}"
),
"currency": (
"\N{CENT SIGN}"
"\N{POUND SIGN}"
"\N{YEN SIGN}"
"\N{PESETA SIGN}"
"\N{EURO SIGN}"
),
"start_punctuation": (
"\N{INVERTED EXCLAMATION MARK}"
"\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}"
"\N{INVERTED QUESTION MARK}"
"\N{COPYRIGHT SIGN}"
"\N{GREEK TONOS}"
"\N{GREEK DIALYTIKA TONOS}"
"\N{LEFT SINGLE QUOTATION MARK}"
"\N{SINGLE LOW-9 QUOTATION MARK}"
"\N{LEFT DOUBLE QUOTATION MARK}"
"\N{DOUBLE LOW-9 QUOTATION MARK}"
"\N{BULLET}"
"\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}"
"\uf8ff" # OS-specific symbol, usually the Apple logo
),
"end_punctuation": (
"\N{REGISTERED SIGN}"
"\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
"\N{DOUBLE ACUTE ACCENT}"
"\N{RIGHT DOUBLE QUOTATION MARK}"
"\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}"
"\N{TRADE MARK SIGN}"
),
"numeric": (
"\N{SUPERSCRIPT TWO}"
"\N{SUPERSCRIPT THREE}"
"\N{SUPERSCRIPT ONE}"
"\N{PLUS-MINUS SIGN}"
"\N{VULGAR FRACTION ONE QUARTER}"
"\N{VULGAR FRACTION ONE HALF}"
"\N{VULGAR FRACTION THREE QUARTERS}"
"\N{MULTIPLICATION SIGN}"
"\N{MICRO SIGN}"
"\N{DIVISION SIGN}"
"\N{FRACTION SLASH}"
"\N{PARTIAL DIFFERENTIAL}"
"\N{INCREMENT}"
"\N{N-ARY PRODUCT}"
"\N{N-ARY SUMMATION}"
"\N{SQUARE ROOT}"
"\N{INFINITY}"
"\N{INTERSECTION}"
"\N{INTEGRAL}"
"\N{ALMOST EQUAL TO}"
"\N{NOT EQUAL TO}"
"\N{IDENTICAL TO}"
"\N{LESS-THAN OR EQUAL TO}"
"\N{GREATER-THAN OR EQUAL TO}"
"\N{NUMERO SIGN}"
),
# Letters that might be used to make emoticon faces (kaomoji), and
# therefore might need to appear in more improbable-looking contexts.
#
# These are concatenated character ranges for use in a regex. I know
# they look like faces themselves. I think expressing the ranges like
# this helps to illustrate why we need to be careful with these
# characters.
"kaomoji": (
"Ò-Ö"
"Ù-Ü"
"ò-ö"
"ø-ü"
"\N{LATIN CAPITAL LETTER O WITH DOUBLE ACUTE}"
"\N{DEGREE SIGN}"
),
"upper_accented": (
# LATIN CAPITAL LETTER A WITH GRAVE - LATIN CAPITAL LETTER N WITH TILDE
"\xc0-\xd1"
# skip capital O's and U's that could be used in kaomoji, but
# include Ø because it's very common in Arabic mojibake:
"\N{LATIN CAPITAL LETTER O WITH STROKE}"
"\N{LATIN CAPITAL LETTER U WITH DIAERESIS}"
"\N{LATIN CAPITAL LETTER Y WITH ACUTE}"
"\N{LATIN CAPITAL LETTER A WITH BREVE}"
"\N{LATIN CAPITAL LETTER A WITH OGONEK}"
"\N{LATIN CAPITAL LETTER C WITH ACUTE}"
"\N{LATIN CAPITAL LETTER C WITH CARON}"
"\N{LATIN CAPITAL LETTER D WITH CARON}"
"\N{LATIN CAPITAL LETTER D WITH STROKE}"
"\N{LATIN CAPITAL LETTER E WITH OGONEK}"
"\N{LATIN CAPITAL LETTER E WITH CARON}"
"\N{LATIN CAPITAL LETTER G WITH BREVE}"
"\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}"
"\N{LATIN CAPITAL LETTER L WITH ACUTE}"
"\N{LATIN CAPITAL LETTER L WITH CARON}"
"\N{LATIN CAPITAL LETTER L WITH STROKE}"
"\N{LATIN CAPITAL LETTER N WITH ACUTE}"
"\N{LATIN CAPITAL LETTER N WITH CARON}"
"\N{LATIN CAPITAL LIGATURE OE}"
"\N{LATIN CAPITAL LETTER R WITH CARON}"
"\N{LATIN CAPITAL LETTER S WITH ACUTE}"
"\N{LATIN CAPITAL LETTER S WITH CEDILLA}"
"\N{LATIN CAPITAL LETTER S WITH CARON}"
"\N{LATIN CAPITAL LETTER T WITH CEDILLA}"
"\N{LATIN CAPITAL LETTER T WITH CARON}"
"\N{LATIN CAPITAL LETTER U WITH RING ABOVE}"
"\N{LATIN CAPITAL LETTER U WITH DOUBLE ACUTE}"
"\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}"
"\N{LATIN CAPITAL LETTER Z WITH ACUTE}"
"\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}"
"\N{LATIN CAPITAL LETTER Z WITH CARON}"
"\N{CYRILLIC CAPITAL LETTER GHE WITH UPTURN}"
),
"lower_accented": (
"\N{LATIN SMALL LETTER SHARP S}"
# LATIN SMALL LETTER A WITH GRAVE - LATIN SMALL LETTER N WITH TILDE
"\xe0-\xf1"
# skip o's and u's that could be used in kaomoji
"\N{LATIN SMALL LETTER A WITH BREVE}"
"\N{LATIN SMALL LETTER A WITH OGONEK}"
"\N{LATIN SMALL LETTER C WITH ACUTE}"
"\N{LATIN SMALL LETTER C WITH CARON}"
"\N{LATIN SMALL LETTER D WITH CARON}"
"\N{LATIN SMALL LETTER D WITH STROKE}"
"\N{LATIN SMALL LETTER E WITH OGONEK}"
"\N{LATIN SMALL LETTER E WITH CARON}"
"\N{LATIN SMALL LETTER G WITH BREVE}"
"\N{LATIN SMALL LETTER L WITH ACUTE}"
"\N{LATIN SMALL LETTER L WITH CARON}"
"\N{LATIN SMALL LETTER L WITH STROKE}"
"\N{LATIN SMALL LIGATURE OE}"
"\N{LATIN SMALL LETTER R WITH ACUTE}"
"\N{LATIN SMALL LETTER S WITH ACUTE}"
"\N{LATIN SMALL LETTER S WITH CEDILLA}"
"\N{LATIN SMALL LETTER S WITH CARON}"
"\N{LATIN SMALL LETTER T WITH CARON}"
"\N{LATIN SMALL LETTER U WITH DIAERESIS}"
"\N{LATIN SMALL LETTER Z WITH ACUTE}"
"\N{LATIN SMALL LETTER Z WITH DOT ABOVE}"
"\N{LATIN SMALL LETTER Z WITH CARON}"
"\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}"
"\N{LATIN SMALL LIGATURE FI}"
"\N{LATIN SMALL LIGATURE FL}"
),
"upper_common": (
"\N{LATIN CAPITAL LETTER THORN}"
"\N{GREEK CAPITAL LETTER ALPHA}-\N{GREEK CAPITAL LETTER OMEGA}"
# not included under 'accented' because these can commonly
# occur at ends of words, in positions where they'd be detected
# as mojibake
"\N{GREEK CAPITAL LETTER ALPHA WITH TONOS}"
"\N{GREEK CAPITAL LETTER EPSILON WITH TONOS}"
"\N{GREEK CAPITAL LETTER ETA WITH TONOS}"
"\N{GREEK CAPITAL LETTER IOTA WITH TONOS}"
"\N{GREEK CAPITAL LETTER OMICRON WITH TONOS}"
"\N{GREEK CAPITAL LETTER UPSILON WITH TONOS}"
"\N{GREEK CAPITAL LETTER OMEGA WITH TONOS}"
"\N{GREEK CAPITAL LETTER IOTA WITH DIALYTIKA}"
"\N{GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA}"
"\N{CYRILLIC CAPITAL LETTER IO}-\N{CYRILLIC CAPITAL LETTER YA}"
),
"lower_common": (
# lowercase thorn does not appear in mojibake
"\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER OMEGA}"
"\N{GREEK SMALL LETTER ALPHA WITH TONOS}"
"\N{GREEK SMALL LETTER EPSILON WITH TONOS}"
"\N{GREEK SMALL LETTER ETA WITH TONOS}"
"\N{GREEK SMALL LETTER IOTA WITH TONOS}"
"\N{GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS}"
"\N{CYRILLIC SMALL LETTER A}-\N{CYRILLIC SMALL LETTER DZHE}"
),
"box": (
# omit the single horizontal line, might be used in kaomoji
"│┌┐┘├┤┬┼"
"\N{BOX DRAWINGS DOUBLE HORIZONTAL}-\N{BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL}"
"▀▄█▌▐░▒▓"
),
}
import warnings
import re
def sequence_weirdness(text):
"""
This was the name of the heuristic used in ftfy 2.x through 5.x. As an
attempt at compatibility with external code that calls the heuristic
directly, we redirect to our new heuristic, :func:`badness`.
"""
warnings.warn(
"`sequence_weirdness()` is an old heuristic, and the current "
"closest equivalent is `ftfy.badness.badness()`"
)
return badness(text)
BADNESS_RE = re.compile(
r"""
[{c1}]
|
[{bad}{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}] [{bad}]
|
[a-zA-Z] [{lower_common}{upper_common}] [{bad}]
|
[{bad}] [{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}]
|
[{lower_accented}{lower_common}{box}{end_punctuation}{currency}{numeric}] [{upper_accented}]
|
[{box}{end_punctuation}{currency}{numeric}] [{lower_accented}]
|
# leave out [upper_accented][currency] without further info, because it's used in some
# fancy leetspeak-esque writing
[{lower_accented}{box}{end_punctuation}] [{currency}]
|
\s [{upper_accented}] [{currency}]
|
[{upper_accented}{box}] [{numeric}]
|
[{lower_accented}{upper_accented}{box}{currency}{end_punctuation}] [{start_punctuation}] [{numeric}]
|
[{lower_accented}{upper_accented}{currency}{numeric}{box}] [{end_punctuation}] [{start_punctuation}]
|
[{currency}{numeric}{box}] [{start_punctuation}]
|
[a-z] [{upper_accented}] [{start_punctuation}{currency}]
|
[{box}] [{kaomoji}]
|
[{lower_accented}{upper_accented}{currency}{numeric}{start_punctuation}{end_punctuation}] [{box}]
|
[{box}] [{end_punctuation}]
|
[{lower_accented}{upper_accented}] [{end_punctuation}] \w
|
# The ligature œ when not followed by an unaccented Latin letter
[Œœ][^A-Za-z]
|
# Common Windows-1252 2-character mojibake that isn't covered by the cases above
[ÂÃÎÐ][€Šš¢£Ÿž\xa0\xad®©°·»{start_punctuation}{end_punctuation}–—´]
|
× [²³]
|
# Windows-1252 mojibake of Arabic words needs to include the 'common' characters.
# To compensate, we require four characters to be matched.
[ØÙ] [{common}{currency}{bad}{numeric}{start_punctuation}ŸŠ®°µ»]
[ØÙ] [{common}{currency}{bad}{numeric}{start_punctuation}ŸŠ®°µ»]
|
# Windows-1252 mojibake that starts 3-character sequences for some South Asian
# alphabets
à[²µ¹¼½¾]
|
# MacRoman mojibake that isn't covered by the cases above
√[±∂†≠®™´≤≥¥µø]
|
≈[°¢]
|
‚Ä[ìîïòôúùû†°¢π]
|
‚[âó][àä°ê]
|
# Windows-1251 mojibake of characters in the U+2000 range
вЂ
|
# Windows-1251 mojibake of Latin-1 characters and/or the Cyrillic alphabet.
# Because the 2-character sequences involved here may be common, we require
# seeing a 3-character sequence.
[ВГРС][{c1}{bad}{start_punctuation}{end_punctuation}{currency}°µ][ВГРС]
|
# A distinctive five-character sequence of Cyrillic letters, which can be
# Windows-1251 mojibake on top of Latin-1 mojibake of Windows-1252 characters.
# Require a Latin letter nearby.
ГўВЂВ.[A-Za-z ]
|
# Windows-1252 encodings of 'à' and 'á', as well as \xa0 itself
Ã[\xa0¡]
|
[a-z]\s?[ÃÂ][ ]
|
^[ÃÂ][ ]
|
# Cases where  precedes a character as an encoding of exactly the same
# character, and the character is common enough
[a-z.,?!{end_punctuation}] Â [ {start_punctuation}{end_punctuation}]
|
# Windows-1253 mojibake of characters in the U+2000 range
β€[™\xa0Ά\xad®°]
|
# Windows-1253 mojibake of Latin-1 characters and/or the Greek alphabet
[ΒΓΞΟ][{c1}{bad}{start_punctuation}{end_punctuation}{currency}°][ΒΓΞΟ]
""".format(
**MOJIBAKE_CATEGORIES
),
re.VERBOSE,
)
def badness(text):
"""
Get the 'badness' of a sequence of text, counting the number of unlikely
character sequences. A badness greater than 0 indicates that some of it
seems to be mojibake.
"""
return len(BADNESS_RE.findall(text))
# Writing a function to apply to the dataset to detect Mojibakes
def is_mojibake_impacted(text):
if not badness.sequence_weirdness(text):
# nothing weird, should be okay
return True
try:
text.encode('sloppy-windows-1252')
except UnicodeEncodeError:
# Not CP-1252 encodable, probably fine
return True
else:
# Encodable as CP-1252, Mojibake alert level high
return False
# Taking an example of row# 8471 Short Desc and fix it
from ftfy import fix_text
print('Grabled text: \033[1m%s\033[0m\nFixed text: \033[1m%s\033[0m' % (df['Short description'][8471],
fix_text(df['Short description'][8471])))
Grabled text: 电脑开机开ä¸å‡ºæ¥ Fixed text: 电脑开机开不出来
# # Sanitizing the dataset from Mojibakes
# df['Short description'] = df['Short description'].apply(fix_text)
# df['Description'] = df['Description'].apply(fix_text)
# Write a function to apply to the dataset to detect Mojibakes
def is_mojibake_impacted(text):
if not badness.sequence_weirdness(text):
# nothing weird, should be okay
return True
try:
text.encode('sloppy-windows-1252')
except UnicodeEncodeError:
# Not CP-1252 encodable, probably fine
return True
else:
# Encodable as CP-1252, Mojibake alert level high
return False
# Take an example of row# 8471 Short Desc and fix it
print('Grabled text: \033[1m%s\033[0m\nFixed text: \033[1m%s\033[0m' % (df['Short description'][8471],
fix_text(df['Short description'][8471])))
Grabled text: 电脑开机开ä¸å‡ºæ¥ Fixed text: 电脑开机开不出来
# Serialize the mojibake treated dataset
df.to_csv('mojibake_treated.csv', index=False, encoding='utf_8_sig')
with open('mojibake_treated.pkl', 'wb') as handle:
pickle.dump(df, handle, protocol=pickle.HIGHEST_PROTOCOL)
Comments:
Language Translation (Goslate: Free Google Translate API)
Goslate is an open source python library that implemented Google Translate API. This uses the Google Translate Ajax API to make calls to such methods as detect and translate. It is choosen over another library Googletrans from Google as Goslate is developed to bypass the ticketing mechanism to prevent simple crawler program to access the Ajax API. Hence Goslate with multiple service urls is able to translate the entire dataset in very few iterations without blocking the user's IP address.
# Define and construct the service urls
svc_domains = ['.com','.com.au','.com.ar','.co.kr','.co.in','.co.jp','.at','.de','.ru','.ch','.fr','.es','.ae']
svc_urls = ['http://translate.google' + domain for domain in svc_domains]
# # Take an example of row# 8471 Short Desc and fix it
# gs = Goslate(service_urls=svc_urls)
# trans_8471 = gs.translate(ticket['Short description'][8471], target_language='en', source_language='auto')
# print('Original text: \033[1m%s\033[0m\nFixed text: \033[1m%s\033[0m' % (ticket['Short description'][8471], trans_8471))
print('Original text: \033[1m%s\033[0m\nFixed text: \033[1m%s\033[0m' % ('电脑开机开不出来', 'Boot the computer does not really come out'))
Original text: 电脑开机开不出来 Fixed text: Boot the computer does not really come out
# # List of column data to consider for translation
# trans_cols = ['Short description','Description']
# # Add a new column to store the detected language
# df.insert(loc=2, column='Language', value=np.nan, allow_duplicates=True)
# for idx in range(df.shape[0]):
# # Instantiate Goslate class in each iteration
# gs = Goslate(service_urls=svc_urls)
# lang = gs.detect(' '.join(df.loc[idx, trans_cols].tolist()))
# row_iter = gs.translate(df.loc[idx, trans_cols].tolist(),
# target_language='en',
# source_language='auto')
# df.loc[idx, trans_cols] = list(row_iter)
# df.Language = lang
# df.head()
# Serialize the translated dataset
df.to_csv('translated_ticket.csv', index=False, encoding='utf_8_sig')
with open('translated_ticket.pkl','wb') as f:
pickle.dump(df, f, pickle.HIGHEST_PROTOCOL)
# Load the translated pickle file incase the IP gets blocked
with open('translated_ticket.pkl','rb') as f:
df = pickle.load(f)
Comments:
Unless paid service is used, Google blocks repetative hits to its Ajax API either via Googletrans or Goslate after certain iterations by cloagging the IP address. Using these list of various domains of translation API as service urls helped the traffic being patched among themselves, in turn allowing a longer buffer before the IP gets blocked.
Text Preprocessing
Text preprocessing is the process of transferring text from human language to machine-readable format for further processing. After a text is obtained, we start with text normalization. Text normalization includes:
# Define regex patterns
EMAIL_PATTERN = r"([\w.+-]+@[a-z\d-]+\.[a-z\d.-]+)"
PUNCT_PATTERN = r"[,|@|\|?|\\|$&*|%|\r|\n|.:|\s+|/|//|\\|/|\||-|<|>|;|(|)|=|+|#|-|\"|[-\]]|{|}]"
# Negative Lookbehind for EmailId replacement- Don't match any number which follows the text "RetainedEmailId"
NUMER_PATTERN = r"(?<!RetainedEmailId)(\d+(?:\.\d+)?)"
# Define a function to treat the texts
def cleanseText(text):
# Make the text unicase (lower)
text = str(text).lower()
# Remove email adresses
# text = re.sub(EMAIL_PATTERN, '', text, flags=re.IGNORECASE)
# Save Email addresses and replace them with custom keyword
email_dict = extract_email(text)
for key in email_dict.keys():
text = text.replace(email_dict[key], key)
# Remove all numbers
text = re.sub(NUMER_PATTERN, '', text)
# Replace all punctuations with blank space
# text = re.sub(PUNCT_PATTERN, " ", text, flags=re.MULTILINE)
text = text.translate(str.maketrans("","", string.punctuation))
text = re.sub(r'\s+', ' ', text)
# Replace multiple spaces from prev step to single
text = re.sub(r' {2,}', " ", text, flags=re.MULTILINE)
text = text.replace('`',"'")
# Replace the email ids back into their original position
for key in email_dict.keys():
text = text.replace(key, email_dict[key])
return text.strip()
def extract_email(text):
# Replaces the email addresses with custom key word and
# save them into a dictionary for future use
unique_emailid = set(re.findall(EMAIL_PATTERN, text))
email_replacement = dict()
for idx, email in enumerate(unique_emailid):
email_replacement[f'RetainedEmailId{idx}'] = email
return email_replacement
# Take an example of row# 32 Description and fix it
print('\033[1mOriginal text:\033[0m')
print(df['Description'][32])
print('_'*100)
print('\033[1mCleaned text:\033[0m')
print(cleanseText(df['Description'][32]))
Original text: received from: kxsceyzo.naokumlb@gmail.com gentles, i have two devices that are trying to share an ip address. they are trying to share 96.26.27.9619. one is a printer with the hostname of prtjc0074, and the other is a new display for erp. the display is using dhcp to get its address assigned and the printer is hard coded. my guess is that the address 96.26.27.9619 did not get set to a static address in dhcp. i need this corrected so the display will pick up another address. ____________________________________________________________________________________________________ Cleaned text: received from kxsceyzo.naokumlb@gmail.com gentles i have two devices that are trying to share an ip address they are trying to share one is a printer with the hostname of prtjc and the other is a new display for erp the display is using dhcp to get its address assigned and the printer is hard coded my guess is that the address did not get set to a static address in dhcp i need this corrected so the display will pick up another address
# Apply the cleaning function to entire dataset
df['Description'] = df['Description'].apply(cleanseText)
df['Short description'] = df['Short description'].apply(cleanseText)
# Verify the data
df.tail()
| Short description | Description | Caller | Assignment group | |
|---|---|---|---|---|
| 8495 | emails not coming in from zz mail | xd xd received from avglmrts.vhqmtiua@gmail.co... | avglmrts vhqmtiua | GRP_29 |
| 8496 | telephonysoftware issue | telephonysoftware issue | rbozivdq gmlhrtvp | GRP_0 |
| 8497 | vip windows password reset for tifpdchb pedxruyf | vip windows password reset for tifpdchb pedxruyf | oybwdsgx oxyhwrfz | GRP_0 |
| 8498 | machine nã£o estã¡ funcionando | i am unable to access the machine utilities to... | ufawcgob aowhxjky | GRP_62 |
| 8499 | an mehreren pcs lassen sich verschiedene prgra... | an mehreren pcs lassen sich verschiedene prgra... | kqvbrspl jyzoklfx | GRP_49 |
Stemming and Lemmatization
!pip install spacy
Requirement already satisfied: spacy in /usr/local/lib/python3.7/dist-packages (2.2.4) Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.0) Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.21.6) Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy) (57.4.0) Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (0.4.1) Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (2.0.6) Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (0.9.1) Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (4.64.0) Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.5) Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (2.23.0) Requirement already satisfied: thinc==7.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (7.4.0) Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (3.0.6) Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.7) Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.1.3) Requirement already satisfied: importlib-metadata>=0.20 in /usr/local/lib/python3.7/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy) (4.11.3) Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy) (3.8.0) Requirement already satisfied: typing-extensions>=3.6.4 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy) (4.2.0) Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (3.0.4) Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2021.10.8) Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2.10) Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (1.24.3)
import spacy
import en_core_web_sm
# Initialize spacy 'en' medium model, keeping only tagger component needed for lemmatization
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# Define a function to lemmatize the descriptions
def lemmatizer(sentence):
# Parse the sentence using the loaded 'en' model object `nlp`
doc = nlp(sentence)
return " ".join([token.lemma_ for token in doc if token.lemma_ !='-PRON-'])
# Take an example of row# 43 Description and lemmatize it
print('\033[1mOriginal text:\033[0m')
print(df['Description'][43])
print('_'*100)
print('\033[1mLemmatized text:\033[0m')
print(lemmatizer(df['Description'][43]))
Original text: received from yisohglr.uvteflgb@gmail.com hi the printer printer is not working and needs a part replaced can you reroute the jobs in queue to printer printer wihuyjdo qpogfwkb has indicated that prqos needs a new part and it may not deliver for a few days so the inwarehousetools will need to print on printer for now this needs to be taken care of today since the inwarehousetools are printed and are picked up by an outside vendor at pm in usa on a daily basis please contact dkmcfreg anwmfvlgenkataramdntyana if you have questions about the jobs in queue for today ____________________________________________________________________________________________________ Lemmatized text: receive from yisohglr.uvteflgb@gmail.com hi the printer printer be not work and need a part replace can reroute the job in queue to printer printer wihuyjdo qpogfwkb have indicate that prqos need a new part and may not deliver for a few day so the inwarehousetool will need to print on printer for now this need to be take care of today since the inwarehousetool be print and be pick up by an outside vendor at pm in usa on a daily basis please contact dkmcfreg anwmfvlgenkataramdntyana if have question about the job in queue for today
# Apply the Lemmatization to entire dataset
df['Description'] = df['Description'].apply(lemmatizer)
df['Short description'] = df['Short description'].apply(lemmatizer)
# Verify the data
df.tail()
| Short description | Description | Caller | Assignment group | |
|---|---|---|---|---|
| 8495 | email not come in from zz mail | xd xd receive from avglmrts.vhqmtiua@gmail.com... | avglmrts vhqmtiua | GRP_29 |
| 8496 | telephonysoftware issue | telephonysoftware issue | rbozivdq gmlhrtvp | GRP_0 |
| 8497 | vip windows password reset for tifpdchb pedxruyf | vip windows password reset for tifpdchb pedxruyf | oybwdsgx oxyhwrfz | GRP_0 |
| 8498 | machine nã£o estã ¡ funcionando | i be unable to access the machine utility to f... | ufawcgob aowhxjky | GRP_62 |
| 8499 | an mehreren pcs lassen sich verschiedene prgra... | an mehreren pcs lassen sich verschiedene prgra... | kqvbrspl jyzoklfx | GRP_49 |
# Serialize the preprocessed dataset
df.to_csv('preprocessed_ticket.csv', index=False, encoding='utf_8_sig')
with open('preprocessed_ticket.pkl','wb') as f:
pickle.dump(df, f, pickle.HIGHEST_PROTOCOL)
# Create new features of length and word count for both of the description columns
df.insert(1, 'sd_len', df['Short description'].astype(str).apply(len))
df.insert(2, 'sd_word_count', df['Short description'].apply(lambda x: len(str(x).split())))
df.insert(4, 'desc_len', df['Description'].astype(str).apply(len))
df.insert(5, 'desc_word_count', df['Description'].apply(lambda x: len(str(x).split())))
df.head()
| Short description | sd_len | sd_word_count | Description | desc_len | desc_word_count | Caller | Assignment group | |
|---|---|---|---|---|---|---|---|---|
| 0 | login issue | 11 | 2 | verify user detailsemployee manager namexd che... | 183 | 31 | spxjnwir pjlcoqds | GRP_0 |
| 1 | outlook | 7 | 1 | xd xd receive from hmjdrvpb.komuaywn@gmail.com... | 184 | 28 | hmjdrvpb komuaywn | GRP_0 |
| 2 | can not log in to vpn | 21 | 6 | xd xd receive from eylqgodm.ybqkwiam@gmail.com... | 93 | 17 | eylqgodm ybqkwiam | GRP_0 |
| 3 | unable to access hrtool page | 28 | 5 | unable to access hrtool page | 28 | 5 | xbkucsvz gcpydteq | GRP_0 |
| 4 | skype error | 11 | 2 | skype error | 11 | 2 | owlgqjme qhcozdfx | GRP_0 |
Exploratory Data Analysis
Exploratory Data Analysis (EDA) is an approach/philosophy for data analysis that employs a variety of techniques (mostly graphical) to
Visually representing the content of a text document is one of the most important tasks in the field of text mining. It helps not only to explore the content of documents from different aspects and at different levels of details, but also helps in summarizing a single document, show the words and topics, detect events, and create storylines.
We'll be using plotly library to generate the graphs and visualizations. We need cufflinks to link plotly to pandas dataframe and add the iplot method
!pip install plotly cufflinks
Requirement already satisfied: plotly in /usr/local/lib/python3.7/dist-packages (5.5.0) Requirement already satisfied: cufflinks in /usr/local/lib/python3.7/dist-packages (0.17.3) Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.7/dist-packages (from plotly) (8.0.1) Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from plotly) (1.15.0) Requirement already satisfied: setuptools>=34.4.1 in /usr/local/lib/python3.7/dist-packages (from cufflinks) (57.4.0) Requirement already satisfied: numpy>=1.9.2 in /usr/local/lib/python3.7/dist-packages (from cufflinks) (1.21.6) Requirement already satisfied: ipywidgets>=7.0.0 in /usr/local/lib/python3.7/dist-packages (from cufflinks) (7.7.0) Requirement already satisfied: ipython>=5.3.0 in /usr/local/lib/python3.7/dist-packages (from cufflinks) (5.5.0) Requirement already satisfied: colorlover>=0.2.1 in /usr/local/lib/python3.7/dist-packages (from cufflinks) (0.3.0) Requirement already satisfied: pandas>=0.19.2 in /usr/local/lib/python3.7/dist-packages (from cufflinks) (1.3.5) Requirement already satisfied: pickleshare in /usr/local/lib/python3.7/dist-packages (from ipython>=5.3.0->cufflinks) (0.7.5) Requirement already satisfied: pexpect in /usr/local/lib/python3.7/dist-packages (from ipython>=5.3.0->cufflinks) (4.8.0) Requirement already satisfied: traitlets>=4.2 in /usr/local/lib/python3.7/dist-packages (from ipython>=5.3.0->cufflinks) (5.2.1.post0) Requirement already satisfied: decorator in /usr/local/lib/python3.7/dist-packages (from ipython>=5.3.0->cufflinks) (4.4.2) Requirement already satisfied: prompt-toolkit<2.0.0,>=1.0.4 in /usr/local/lib/python3.7/dist-packages (from ipython>=5.3.0->cufflinks) (1.0.18) Requirement already satisfied: simplegeneric>0.8 in /usr/local/lib/python3.7/dist-packages (from ipython>=5.3.0->cufflinks) (0.8.1) Requirement already satisfied: pygments in /usr/local/lib/python3.7/dist-packages (from ipython>=5.3.0->cufflinks) (2.6.1) Requirement already satisfied: jupyterlab-widgets>=1.0.0 in /usr/local/lib/python3.7/dist-packages (from ipywidgets>=7.0.0->cufflinks) (1.1.0) Requirement already satisfied: ipython-genutils~=0.2.0 in /usr/local/lib/python3.7/dist-packages (from ipywidgets>=7.0.0->cufflinks) (0.2.0) Requirement already satisfied: nbformat>=4.2.0 in /usr/local/lib/python3.7/dist-packages (from ipywidgets>=7.0.0->cufflinks) (5.4.0) Requirement already satisfied: widgetsnbextension~=3.6.0 in /usr/local/lib/python3.7/dist-packages (from ipywidgets>=7.0.0->cufflinks) (3.6.0) Requirement already satisfied: ipykernel>=4.5.1 in /usr/local/lib/python3.7/dist-packages (from ipywidgets>=7.0.0->cufflinks) (4.10.1) Requirement already satisfied: jupyter-client in /usr/local/lib/python3.7/dist-packages (from ipykernel>=4.5.1->ipywidgets>=7.0.0->cufflinks) (5.3.5) Requirement already satisfied: tornado>=4.0 in /usr/local/lib/python3.7/dist-packages (from ipykernel>=4.5.1->ipywidgets>=7.0.0->cufflinks) (5.1.1) Requirement already satisfied: fastjsonschema in /usr/local/lib/python3.7/dist-packages (from nbformat>=4.2.0->ipywidgets>=7.0.0->cufflinks) (2.15.3) Requirement already satisfied: jsonschema>=2.6 in /usr/local/lib/python3.7/dist-packages (from nbformat>=4.2.0->ipywidgets>=7.0.0->cufflinks) (4.3.3) Requirement already satisfied: jupyter-core in /usr/local/lib/python3.7/dist-packages (from nbformat>=4.2.0->ipywidgets>=7.0.0->cufflinks) (4.10.0) Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema>=2.6->nbformat>=4.2.0->ipywidgets>=7.0.0->cufflinks) (21.4.0) Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from jsonschema>=2.6->nbformat>=4.2.0->ipywidgets>=7.0.0->cufflinks) (4.2.0) Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from jsonschema>=2.6->nbformat>=4.2.0->ipywidgets>=7.0.0->cufflinks) (4.11.3) Requirement already satisfied: importlib-resources>=1.4.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema>=2.6->nbformat>=4.2.0->ipywidgets>=7.0.0->cufflinks) (5.7.1) Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema>=2.6->nbformat>=4.2.0->ipywidgets>=7.0.0->cufflinks) (0.18.1) Requirement already satisfied: zipp>=3.1.0 in /usr/local/lib/python3.7/dist-packages (from importlib-resources>=1.4.0->jsonschema>=2.6->nbformat>=4.2.0->ipywidgets>=7.0.0->cufflinks) (3.8.0) Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.19.2->cufflinks) (2.8.2) Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.19.2->cufflinks) (2022.1) Requirement already satisfied: wcwidth in /usr/local/lib/python3.7/dist-packages (from prompt-toolkit<2.0.0,>=1.0.4->ipython>=5.3.0->cufflinks) (0.2.5) Requirement already satisfied: notebook>=4.4.1 in /usr/local/lib/python3.7/dist-packages (from widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (5.3.1) Requirement already satisfied: jinja2 in /usr/local/lib/python3.7/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (2.11.3) Requirement already satisfied: nbconvert in /usr/local/lib/python3.7/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (5.6.1) Requirement already satisfied: terminado>=0.8.1 in /usr/local/lib/python3.7/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (0.13.3) Requirement already satisfied: Send2Trash in /usr/local/lib/python3.7/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (1.8.0) Requirement already satisfied: pyzmq>=13 in /usr/local/lib/python3.7/dist-packages (from jupyter-client->ipykernel>=4.5.1->ipywidgets>=7.0.0->cufflinks) (22.3.0) Requirement already satisfied: ptyprocess in /usr/local/lib/python3.7/dist-packages (from terminado>=0.8.1->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (0.7.0) Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from jinja2->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (2.0.1) Requirement already satisfied: defusedxml in /usr/local/lib/python3.7/dist-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (0.7.1) Requirement already satisfied: testpath in /usr/local/lib/python3.7/dist-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (0.6.0) Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.7/dist-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (1.5.0) Requirement already satisfied: mistune<2,>=0.8.1 in /usr/local/lib/python3.7/dist-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (0.8.4) Requirement already satisfied: bleach in /usr/local/lib/python3.7/dist-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (5.0.0) Requirement already satisfied: entrypoints>=0.2.2 in /usr/local/lib/python3.7/dist-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (0.4) Requirement already satisfied: webencodings in /usr/local/lib/python3.7/dist-packages (from bleach->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.0.0->cufflinks) (0.5.1)
print('Plotly:', py.__version__)
print('Cufflinks:', cf.__version__)
Plotly: 5.5.0 Cufflinks: 0.17.3
Univariate visualization
Single-variable or univariate visualization is the simplest type of visualization which consists of observations on only a single characteristic or attribute. Univariate visualization includes histogram, bar plots and line charts.
import plotly.io as pio
pio.renderers.default = "colab"
# Assignment group distribution
print('\033[1mTotal assignment groups:\033[0m', df['Assignment group'].nunique())
# Histogram
df['Assignment group'].iplot(
kind='hist',
xTitle='Assignment Group',
yTitle='count',
title='Assignment Group Distribution- Histogram (Fig-1)')
Total assignment groups: 74
# Pie chart
assgn_grp = pd.DataFrame(df.groupby('Assignment group').size(),columns = ['Count']).reset_index()
assgn_grp.iplot(
kind='pie',
labels='Assignment group',
values='Count',
title='Assignment Group Distribution- Pie Chart (Fig-2)',
hoverinfo="label+percent+name", hole=0.25)
# Bar plot
df['Assignment group'].iplot(
kind='bar',
yTitle='Assignment Group',
xTitle='Record #',
colorscale='-plotly',
title='Assignment Group Distribution- Bar Chart (Fig-3)')
# Find out the Assignment Groups with less than equal to 30 tickets assigned
rare_ticket = df.groupby(['Assignment group']).filter(lambda x: len(x) <= 30)
print('\033[1m#Groups with less than equal to 30 tickets assigned:\033[0m', rare_ticket['Assignment group'].nunique())
rare_ticket['Assignment group'].iplot(
kind='hist',
xTitle='Assignment Group',
yTitle='count',
colorscale='-orrd',
title='#Records by rare Assignment Groups- Histogram (Fig-4)')
#Groups with less than equal to 30 tickets assigned: 40
# Distribution of Assignment groups excluding GRP_0 & rare groups (groups with less than equal 30 tickets assigned)
excluded_grp = ['GRP_0']
excluded_grp.extend(rare_ticket['Assignment group'].unique())
filtered_tkt = df[~df['Assignment group'].isin(excluded_grp)]
# Pie chart
filtered_assgn_grp = pd.DataFrame(filtered_tkt.groupby('Assignment group').size(),columns = ['Count']).reset_index()
filtered_assgn_grp.iplot(
kind='pie',
labels='Assignment group',
values='Count',
title='#Records by Assignment groups(excluding GRP_0 and rare groups)- Pie Chart (Fig-5)',
pull=np.linspace(0,0.3,filtered_assgn_grp['Assignment group'].nunique()))
# Histogram
filtered_tkt['Assignment group'].iplot(
kind='histogram',
xTitle='Assignment Group',
yTitle='count',
colorscale='-gnbu',
title='#Records by Assignment groups(excluding GRP_0 and rare groups)- Histogram (Fig-6)')
Comments:
The distribution of Callers
# Find out top 10 callers in terms of frequency of raising tickets in the entire dataset
print('\033[1mTotal caller count:\033[0m', df['Caller'].nunique())
df1 = pd.DataFrame(df.groupby(['Caller']).size().nlargest(10), columns=['Count']).reset_index()
df1.iplot(kind='pie',
labels='Caller',
values='Count',
title='Top 10 caller- Pie Chart (Fig-7)',
colorscale='-spectral',
pull=[0,0,0,0,0.05,0.1,0.15,0.2,0.25,0.3])
Total caller count: 2950
# Top 5 callers in each assignment group
top_n = 5
s = df['Caller'].groupby(df['Assignment group']).value_counts()
caller_grp = pd.DataFrame(s.groupby(level=0).nlargest(top_n).reset_index(level=0, drop=True))
caller_grp.head(15)
| Caller | ||
|---|---|---|
| Assignment group | Caller | |
| GRP_0 | fumkcsji sarmtlhy | 132 |
| rbozivdq gmlhrtvp | 86 | |
| olckhmvx pcqobjnd | 54 | |
| efbwiadp dicafxhv | 45 | |
| mfeyouli ndobtzpw | 13 | |
| GRP_1 | bpctwhsn kzqsbmtp | 6 |
| jloygrwh acvztedi | 4 | |
| jyoqwxhz clhxsoqy | 3 | |
| spxqmiry zpwgoqju | 3 | |
| kbnfxpsy gehxzayq | 2 | |
| GRP_10 | bpctwhsn kzqsbmtp | 60 |
| ihfkwzjd erbxoyqk | 6 | |
| dizquolf hlykecxa | 5 | |
| gnasmtvx cwxtsvkm | 3 | |
| hlrmufzx qcdzierm | 3 |
# Visualize Top 5 callers in each of top 10 assignment groups
top_n = 10
top_grps = assgn_grp.nlargest(top_n, 'Count')['Assignment group'].tolist()
fig_cols = 5
fig_rows = int(np.ceil(top_n/fig_cols))
fig, axes = plt.subplots(fig_rows, fig_cols, figsize=(13,9.5))
fig.suptitle('Top 5 callers in each of top 10 assignment groups- Pie Chart (Fig-8)', y=1, va= 'bottom', size='20')
for row in range(fig_rows):
for col in range(fig_cols):
grp_n = fig_cols * row + col
if grp_n < top_n:
xs = caller_grp.xs(top_grps[grp_n])
_ = axes[row,col].pie(xs, autopct='%1.1f%%', explode=[0.05]*5)
axes[row,col].legend(labels=xs.index,loc="best")
axes[row,col].axis('equal')
axes[row,col].set_title(top_grps[grp_n])
plt.tight_layout()
# Check if any caller appears to raise ticket for multiple groups
mul_caller = caller_grp[caller_grp.Caller.duplicated()]
uni_mul_caller = [idx[1] for idx in mul_caller.index[mul_caller.Caller.unique()]]
print(f'\033[1mFollowing {len(uni_mul_caller)} callers happen to raise tickets for multiple groups:\033[0m\n')
print(uni_mul_caller)
mul_caller
Following 15 callers happen to raise tickets for multiple groups:
['hlrmufzx qcdzierm', 'fbgetczn jlsvxura', 'gnasmtvx cwxtsvkm', 'ihfkwzjd erbxoyqk', 'tqfnalpj qyoscnge', 'fmqubnvs kcxpeyiv', 'tghrloks jbgcvlmf', 'jwqyxbzs adpvilqu', 'nuhfwplj ojcwxser', 'oldrctiu bxurpsyi', 'vlymsnej whlqxcst', 'dkmcfreg anwmfvlg', 'bpctwhsn kzqsbmtp', 'spxqmiry zpwgoqju', 'obanjrhg rnafleys']
| Caller | ||
|---|---|---|
| Assignment group | Caller | |
| GRP_1 | spxqmiry zpwgoqju | 3 |
| GRP_10 | ihfkwzjd erbxoyqk | 6 |
| gnasmtvx cwxtsvkm | 3 | |
| hlrmufzx qcdzierm | 3 | |
| GRP_11 | tghrloks jbgcvlmf | 2 |
| ... | ... | ... |
| GRP_73 | kcnosyae zlpmfxgs | 1 |
| GRP_8 | ZkBogxib QsEJzdZO | 54 |
| GRP_9 | ctzykflo evzbhgru | 3 |
| sholvcmf bjtpomrl | 3 | |
| urhpnlaf agmsfqil | 3 |
281 rows × 1 columns
Comments:
The distribution of Short description lengths
# Serialize the preprocessed dataset
df.to_csv('preprocessed_ticket.csv', index=False, encoding='utf_8_sig')
with open('preprocessed_ticket.pkl','wb') as f:
pickle.dump(df, f, pickle.HIGHEST_PROTOCOL)
# Short Desc text length
df['sd_len'].iplot(
kind='scatter',
xTitle='text length',
yTitle='count',
title='Short Desc. Text Length Distribution (Fig-9)')
# Short desc word count
df['sd_word_count'].iplot(
kind='hist',
bins=100,
xTitle='word count',
linecolor='black',
yTitle='count',
colorscale='pastel1',
title='Short desc. Word Count Distribution (Fig-10)')
The distribution of Description lengths
# Description text length
df['desc_len'].iplot(
kind='bar',
xTitle='text length',
yTitle='count',
colorscale='-ylgn',
title='Description Text Length Distribution (Fig-11)')
# Description word count
df['desc_word_count'].iplot(
kind='bar',
xTitle='word count',
linecolor='black',
yTitle='count',
colorscale='-bupu',
title='Description Word Count Distribution (Fig-12)')
# Merge the Short descrition and Description column texts to create a new column
df.insert(loc=8,
column='Summary',
allow_duplicates=True,
value=list(df['Short description'].str.strip() + ' ' + df['Description'].str.strip()))
# Extend the English Stop Wordss
STOP_WORDS = STOPWORDS.union({'yes','na','hi',
'receive','hello',
'regards','thanks',
'from','greeting',
'forward','reply',
'will','please',
'see','help','able'})
# Generic function to derive top N n-grams from the corpus
def get_top_n_ngrams(corpus, top_n=None, ngram_range=(1,1), stopwords=None):
vec = CountVectorizer(ngram_range=ngram_range,
stop_words=stopwords).fit(corpus)
bag_of_words = vec.transform(corpus)
sum_words = bag_of_words.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)
return words_freq[:top_n]
# Top 50 Unigrams before removing stop words
top_n = 50
ngram_range = (1,1)
uni_grams = get_top_n_ngrams(df.Summary, top_n, ngram_range)
df1 = pd.DataFrame(uni_grams, columns = ['Summary' , 'count'])
df1.groupby('Summary').sum()['count'].sort_values(ascending=False).iplot(
kind='bar',
yTitle='Count',
linecolor='black',
colorscale='piyg',
title=f'Top {top_n} Unigrams in Summary')
# Top 50 Unigrams after removing stop words
uni_grams_sw = get_top_n_ngrams(df.Summary, top_n, ngram_range, stopwords=STOP_WORDS)
df1 = pd.DataFrame(uni_grams_sw, columns = ['Summary' , 'count'])
df1.groupby('Summary').sum()['count'].sort_values(ascending=False).iplot(
kind='bar',
yTitle='Count',
linecolor='black',
colorscale='-piyg',
title=f'Top {top_n} Unigrams in Summary without stop words')
def generate_word_clod(corpus):
# mask = np.array(Image.open('cloud2.png'))
# Instantiate the wordcloud object
wordcloud = WordCloud(width = 800, height = 800,
background_color ='white',
stopwords=STOP_WORDS,
# mask=mask,
min_font_size = 10).generate(corpus)
# plot the WordCloud image
plt.figure(figsize = (12, 12), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()
# Word Cloud for all tickets assigned to GRP_0
generate_word_clod(' '.join(df[df['Assignment group'] == 'GRP_0'].Summary.str.strip()))
# Word Cloud for all tickets assigned to GRP_8
generate_word_clod(' '.join(df[df['Assignment group'] == 'GRP_8'].Summary.str.strip()))
# Word Cloud for all tickets assigned to GRP_24
generate_word_clod(' '.join(df[df['Assignment group'] == 'GRP_24'].Summary.str.strip()))
# Word Cloud for all tickets assigned to GRP_12
generate_word_clod(' '.join(df[df['Assignment group'] == 'GRP_12'].Summary.str.strip()))
# Word Cloud for all tickets assigned to GRP_9
generate_word_clod(' '.join(df[df['Assignment group'] == 'GRP_9'].Summary.str.strip()))
# Word Cloud for all tickets assigned to GRP_2
generate_word_clod(' '.join(df[df['Assignment group'] == 'GRP_2'].Summary.str.strip()))
# Word Cloud for all tickets assigned to GRP_19
generate_word_clod(' '.join(df[df['Assignment group'] == 'GRP_19'].Summary.str.strip()))
# Word Cloud for all tickets assigned to GRP_3
generate_word_clod(' '.join(df[df['Assignment group'] == 'GRP_3'].Summary.str.strip()))
# Word Cloud for all tickets assigned to GRP_6
generate_word_clod(' '.join(df[df['Assignment group'] == 'GRP_6'].Summary.str.strip()))
# Word Cloud for all tickets assigned to GRP_13
generate_word_clod(' '.join(df[df['Assignment group'] == 'GRP_13'].Summary.str.strip()))
# Generate wordcloud for ticket Short description
generate_word_clod(' '.join(df['Short description'].str.strip()))
# Generate wordcloud for ticket Description
generate_word_clod(' '.join(df.Description.str.strip()))
# Generate wordcloud for ticket Summary
generate_word_clod(' '.join(df.Summary.str.strip()))
# Serialize the dataset after EDA
with open('model_ready.pkl','wb') as f:
pickle.dump(df, f, pickle.HIGHEST_PROTOCOL)
Comments:
job fail (897 times)
Analysis on GRP_0 which is the most frequent group to assign a ticket to reveals that this group deals with mostly the maintenance problems such as password reset, account lock, login issue, ticket update etc.
Model Building
Let's proceed towards trying different model architectures mentioned below to classify the problem and validate which one is outperforming.
Let's create another column of categorical datatype from Assignment groups. Let's write some generic methods for utilities and to plot evaluation metrics.
# Create a target categorical column
df['target'] = df['Assignment group'].astype('category').cat.codes
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8500 entries, 0 to 8499 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Short description 8500 non-null object 1 sd_len 8500 non-null int64 2 sd_word_count 8500 non-null int64 3 Description 8500 non-null object 4 desc_len 8500 non-null int64 5 desc_word_count 8500 non-null int64 6 Caller 8500 non-null object 7 Assignment group 8500 non-null object 8 Summary 8500 non-null object 9 target 8500 non-null int8 dtypes: int64(4), int8(1), object(5) memory usage: 606.1+ KB
# A class that logs the time
class Timer():
'''
A generic class to log the time
'''
def __init__(self):
self.start_ts = None
def start(self):
self.start_ts = time()
def stop(self):
return 'Time taken: %2fs' % (time()-self.start_ts)
timer = Timer()
# A method that plots the Precision-Recall curve
def plot_prec_recall_vs_thresh(precisions, recalls, thresholds):
plt.figure(figsize=(10,5))
plt.plot(thresholds, precisions[:-1], 'b--', label='precision')
plt.plot(thresholds, recalls[:-1], 'g--', label = 'recall')
plt.xlabel('Threshold')
plt.legend()
# A method to train and test the model
def run_classification(estimator, X_train, X_test, y_train, y_test, arch_name=None, pipelineRequired=True, isDeepModel=False):
timer.start()
# train the model
clf = estimator
if pipelineRequired :
clf = Pipeline([('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', estimator),
])
if isDeepModel :
clf.fit(X_train, y_train, validation_data=(X_test, y_test),epochs=10, batch_size=128,verbose=1,callbacks=call_backs(arch_name))
# predict from the claffier
y_pred = clf.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)
y_train_pred = clf.predict(X_train)
y_train_pred = np.argmax(y_train_pred, axis=1)
else :
clf.fit(X_train, y_train)
# predict from the claffier
y_pred = clf.predict(X_test)
y_train_pred = clf.predict(X_train)
print('Estimator:', clf)
print('='*80)
print('Training accuracy: %.2f%%' % (accuracy_score(y_train,y_train_pred) * 100))
print('Testing accuracy: %.2f%%' % (accuracy_score(y_test, y_pred) * 100))
print('='*80)
print('Confusion matrix:\n %s' % (confusion_matrix(y_test, y_pred)))
print('='*80)
print('Classification report:\n %s' % (classification_report(y_test, y_pred)))
print(timer.stop(), 'to run the model')
# Create training and test datasets with 80:20 ratio
X_train, X_test, y_train, y_test = train_test_split(df.Summary,
df.target,
test_size=0.20,
random_state=42)
print('\033[1mShape of the training set:\033[0m', X_train.shape, X_test.shape)
print('\033[1mShape of the test set:\033[0m', y_train.shape, y_test.shape)
Shape of the training set: (6800,) (1700,) Shape of the test set: (6800,) (1700,)
Naive Bayes Classifier
Naive Bayes is a simple technique for constructing classifiers: models that assign class labels to problem instances, represented as vectors of feature values, where the class labels are drawn from some finite set. There is not a single algorithm for training such classifiers, but a family of algorithms based on a common principle: all naive Bayes classifiers assume that the value of a particular feature is independent of the value of any other feature, given the class variable.
Advantages:
Disadvantages:
run_classification(MultinomialNB(), X_train, X_test, y_train, y_test)
Estimator: Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
('clf', MultinomialNB())])
================================================================================
Training accuracy: 56.93%
Testing accuracy: 53.24%
================================================================================
Confusion matrix:
[[761 0 0 ... 0 0 0]
[ 3 0 0 ... 0 2 0]
[ 15 0 0 ... 0 9 0]
...
[ 1 0 0 ... 0 0 0]
[ 15 0 0 ... 0 106 0]
[ 18 0 0 ... 0 38 0]]
================================================================================
Classification report:
precision recall f1-score support
0 0.53 1.00 0.69 761
1 0.00 0.00 0.00 8
2 0.00 0.00 0.00 24
3 0.00 0.00 0.00 5
4 0.64 0.21 0.32 42
5 0.00 0.00 0.00 26
6 0.00 0.00 0.00 20
7 0.00 0.00 0.00 8
8 0.00 0.00 0.00 20
9 0.00 0.00 0.00 17
10 0.00 0.00 0.00 18
11 0.00 0.00 0.00 58
12 0.00 0.00 0.00 51
13 0.00 0.00 0.00 5
14 0.00 0.00 0.00 7
15 0.00 0.00 0.00 5
16 0.00 0.00 0.00 3
17 1.00 0.40 0.57 72
18 0.00 0.00 0.00 20
19 0.00 0.00 0.00 13
20 0.00 0.00 0.00 2
21 0.00 0.00 0.00 8
22 0.00 0.00 0.00 20
23 0.00 0.00 0.00 42
24 0.00 0.00 0.00 6
25 0.00 0.00 0.00 24
27 0.00 0.00 0.00 14
28 0.00 0.00 0.00 12
29 0.00 0.00 0.00 1
30 0.00 0.00 0.00 2
31 0.00 0.00 0.00 2
32 0.00 0.00 0.00 2
33 0.00 0.00 0.00 6
34 0.00 0.00 0.00 26
35 0.00 0.00 0.00 8
36 0.00 0.00 0.00 10
37 0.00 0.00 0.00 8
39 0.00 0.00 0.00 1
40 0.00 0.00 0.00 12
41 0.00 0.00 0.00 2
42 0.00 0.00 0.00 9
43 0.00 0.00 0.00 6
45 0.00 0.00 0.00 28
46 0.00 0.00 0.00 2
47 0.00 0.00 0.00 1
48 0.00 0.00 0.00 1
49 0.00 0.00 0.00 3
51 0.00 0.00 0.00 1
56 0.00 0.00 0.00 46
57 0.00 0.00 0.00 5
59 0.00 0.00 0.00 5
60 0.00 0.00 0.00 1
62 0.00 0.00 0.00 2
63 0.00 0.00 0.00 1
64 0.00 0.00 0.00 1
66 0.00 0.00 0.00 1
67 0.00 0.00 0.00 18
70 0.00 0.00 0.00 1
72 0.48 0.88 0.62 121
73 0.00 0.00 0.00 56
accuracy 0.53 1700
macro avg 0.04 0.04 0.04 1700
weighted avg 0.33 0.53 0.39 1700
Time taken: 1.206412s to run the model
pickle.dump(MultinomialNB(), open('MNBmodel.pkl','wb'))
K-nearest Neighbor
run_classification(KNeighborsClassifier(), X_train, X_test, y_train, y_test)
Estimator: Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
('clf', KNeighborsClassifier())])
================================================================================
Training accuracy: 70.19%
Testing accuracy: 62.82%
================================================================================
Confusion matrix:
[[742 0 0 ... 0 0 0]
[ 2 0 0 ... 0 2 0]
[ 10 0 9 ... 0 0 1]
...
[ 1 0 0 ... 0 0 0]
[ 4 0 2 ... 0 97 8]
[ 15 0 0 ... 0 28 10]]
================================================================================
Classification report:
precision recall f1-score support
0 0.63 0.98 0.77 761
1 0.00 0.00 0.00 8
2 0.75 0.38 0.50 24
3 1.00 0.40 0.57 5
4 0.48 0.36 0.41 42
5 0.44 0.27 0.33 26
6 0.50 0.25 0.33 20
7 0.00 0.00 0.00 8
8 0.50 0.05 0.09 20
9 1.00 0.59 0.74 17
10 0.50 0.28 0.36 18
11 0.53 0.16 0.24 58
12 0.67 0.39 0.49 51
13 0.00 0.00 0.00 5
14 0.00 0.00 0.00 7
15 0.00 0.00 0.00 5
16 0.00 0.00 0.00 3
17 0.97 0.81 0.88 72
18 0.50 0.15 0.23 20
19 1.00 0.08 0.14 13
20 0.00 0.00 0.00 2
21 0.00 0.00 0.00 8
22 0.80 0.20 0.32 20
23 0.60 0.14 0.23 42
24 0.29 0.67 0.40 6
25 0.44 0.17 0.24 24
27 0.57 0.29 0.38 14
28 1.00 0.08 0.15 12
29 0.00 0.00 0.00 1
30 1.00 0.50 0.67 2
31 0.00 0.00 0.00 2
32 0.00 0.00 0.00 2
33 0.33 0.17 0.22 6
34 0.00 0.00 0.00 26
35 0.50 0.12 0.20 8
36 1.00 0.20 0.33 10
37 1.00 0.38 0.55 8
39 0.00 0.00 0.00 1
40 0.50 0.08 0.14 12
41 0.00 0.00 0.00 2
42 0.00 0.00 0.00 9
43 1.00 0.33 0.50 6
45 0.86 0.43 0.57 28
46 0.00 0.00 0.00 2
47 0.00 0.00 0.00 1
48 0.00 0.00 0.00 1
49 0.00 0.00 0.00 3
51 0.00 0.00 0.00 1
54 0.00 0.00 0.00 0
56 0.74 0.54 0.62 46
57 0.00 0.00 0.00 5
59 0.33 0.20 0.25 5
60 0.00 0.00 0.00 1
62 0.00 0.00 0.00 2
63 0.00 0.00 0.00 1
64 0.00 0.00 0.00 1
66 0.00 0.00 0.00 1
67 0.33 0.11 0.17 18
70 0.00 0.00 0.00 1
72 0.58 0.80 0.67 121
73 0.48 0.18 0.26 56
accuracy 0.63 1700
macro avg 0.36 0.18 0.21 1700
weighted avg 0.60 0.63 0.56 1700
Time taken: 3.362777s to run the model
pickle.dump(KNeighborsClassifier(), open('KNNmodel.pkl','wb'))
Support Vector Machine (SVM)
# SVM with Linear kernel
run_classification(LinearSVC(), X_train, X_test, y_train, y_test)
Estimator: Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
('clf', LinearSVC())])
================================================================================
Training accuracy: 93.40%
Testing accuracy: 68.35%
================================================================================
Confusion matrix:
[[711 0 0 ... 0 0 1]
[ 0 0 0 ... 0 1 0]
[ 4 0 14 ... 0 1 0]
...
[ 1 0 0 ... 0 0 0]
[ 0 1 2 ... 0 107 1]
[ 7 0 0 ... 0 35 10]]
================================================================================
Classification report:
precision recall f1-score support
0 0.75 0.93 0.83 761
1 0.00 0.00 0.00 8
2 0.64 0.58 0.61 24
3 0.67 0.40 0.50 5
4 0.49 0.60 0.54 42
5 0.56 0.54 0.55 26
6 0.45 0.45 0.45 20
7 0.50 0.12 0.20 8
8 0.27 0.15 0.19 20
9 0.88 0.88 0.88 17
10 0.56 0.28 0.37 18
11 0.50 0.24 0.33 58
12 0.54 0.49 0.52 51
13 1.00 0.20 0.33 5
14 0.67 0.29 0.40 7
15 0.33 0.20 0.25 5
16 0.75 1.00 0.86 3
17 0.94 0.92 0.93 72
18 0.79 0.55 0.65 20
19 0.50 0.08 0.13 13
20 0.00 0.00 0.00 2
21 1.00 0.12 0.22 8
22 0.55 0.60 0.57 20
23 0.43 0.36 0.39 42
24 0.36 0.83 0.50 6
25 0.75 0.38 0.50 24
27 0.40 0.43 0.41 14
28 0.75 0.25 0.38 12
29 0.00 0.00 0.00 1
30 1.00 1.00 1.00 2
31 0.00 0.00 0.00 2
32 1.00 0.50 0.67 2
33 0.67 0.33 0.44 6
34 0.22 0.08 0.11 26
35 0.00 0.00 0.00 8
36 0.88 0.70 0.78 10
37 0.60 0.38 0.46 8
39 0.00 0.00 0.00 1
40 0.50 0.08 0.14 12
41 0.00 0.00 0.00 2
42 1.00 0.11 0.20 9
43 1.00 0.50 0.67 6
45 0.76 0.46 0.58 28
46 0.00 0.00 0.00 2
47 1.00 1.00 1.00 1
48 0.00 0.00 0.00 1
49 0.00 0.00 0.00 3
51 1.00 1.00 1.00 1
56 0.84 0.57 0.68 46
57 1.00 0.20 0.33 5
59 0.00 0.00 0.00 5
60 0.00 0.00 0.00 1
62 0.00 0.00 0.00 2
63 0.00 0.00 0.00 1
64 0.00 0.00 0.00 1
66 0.00 0.00 0.00 1
67 0.54 0.39 0.45 18
70 0.00 0.00 0.00 1
72 0.57 0.88 0.69 121
73 0.56 0.18 0.27 56
accuracy 0.68 1700
macro avg 0.49 0.34 0.37 1700
weighted avg 0.66 0.68 0.65 1700
Time taken: 1.075882s to run the model
pickle.dump(LinearSVC(), open('LinearSVCmodel.pkl','wb'))
# SVM with RBF kernel
run_classification(SVC(kernel='rbf'), X_train, X_test, y_train, y_test)
Estimator: Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
('clf', SVC())])
================================================================================
Training accuracy: 82.47%
Testing accuracy: 62.29%
================================================================================
Confusion matrix:
[[753 0 0 ... 0 0 0]
[ 2 0 0 ... 0 1 0]
[ 14 0 9 ... 0 1 0]
...
[ 1 0 0 ... 0 0 0]
[ 4 0 2 ... 0 107 1]
[ 16 0 0 ... 0 35 3]]
================================================================================
Classification report:
precision recall f1-score support
0 0.60 0.99 0.75 761
1 0.00 0.00 0.00 8
2 0.82 0.38 0.51 24
3 0.00 0.00 0.00 5
4 0.50 0.43 0.46 42
5 0.55 0.23 0.32 26
6 1.00 0.25 0.40 20
7 0.00 0.00 0.00 8
8 0.00 0.00 0.00 20
9 1.00 0.88 0.94 17
10 0.67 0.11 0.19 18
11 1.00 0.03 0.07 58
12 0.71 0.39 0.51 51
13 0.00 0.00 0.00 5
14 1.00 0.14 0.25 7
15 0.00 0.00 0.00 5
16 0.00 0.00 0.00 3
17 0.95 0.82 0.88 72
18 0.86 0.30 0.44 20
19 0.00 0.00 0.00 13
20 0.00 0.00 0.00 2
21 0.00 0.00 0.00 8
22 0.33 0.05 0.09 20
23 0.67 0.05 0.09 42
24 0.50 0.17 0.25 6
25 0.80 0.17 0.28 24
27 0.60 0.21 0.32 14
28 0.00 0.00 0.00 12
29 0.00 0.00 0.00 1
30 1.00 0.50 0.67 2
31 0.00 0.00 0.00 2
32 0.00 0.00 0.00 2
33 1.00 0.17 0.29 6
34 0.00 0.00 0.00 26
35 0.00 0.00 0.00 8
36 0.00 0.00 0.00 10
37 0.00 0.00 0.00 8
39 0.00 0.00 0.00 1
40 0.00 0.00 0.00 12
41 0.00 0.00 0.00 2
42 0.00 0.00 0.00 9
43 1.00 0.17 0.29 6
45 0.76 0.46 0.58 28
46 0.00 0.00 0.00 2
47 0.00 0.00 0.00 1
48 0.00 0.00 0.00 1
49 0.00 0.00 0.00 3
51 0.00 0.00 0.00 1
56 0.85 0.50 0.63 46
57 0.00 0.00 0.00 5
59 0.00 0.00 0.00 5
60 0.00 0.00 0.00 1
62 0.00 0.00 0.00 2
63 0.00 0.00 0.00 1
64 0.00 0.00 0.00 1
66 0.00 0.00 0.00 1
67 0.50 0.17 0.25 18
70 0.00 0.00 0.00 1
72 0.58 0.88 0.70 121
73 0.60 0.05 0.10 56
accuracy 0.62 1700
macro avg 0.31 0.14 0.17 1700
weighted avg 0.59 0.62 0.53 1700
Time taken: 53.603992s to run the model
pickle.dump(SVC(kernel='rbf'), open('SVCrbfmodel.pkl','wb'))
Decision Tree
run_classification(DecisionTreeClassifier(), X_train, X_test, y_train, y_test)
Estimator: Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
('clf', DecisionTreeClassifier())])
================================================================================
Training accuracy: 95.60%
Testing accuracy: 56.35%
================================================================================
Confusion matrix:
[[627 1 2 ... 0 1 3]
[ 0 0 0 ... 0 1 0]
[ 9 0 11 ... 0 0 1]
...
[ 1 0 0 ... 0 0 0]
[ 2 1 2 ... 0 89 7]
[ 9 0 0 ... 0 28 9]]
================================================================================
Classification report:
precision recall f1-score support
0 0.70 0.82 0.76 761
1 0.00 0.00 0.00 8
2 0.48 0.46 0.47 24
3 0.25 0.20 0.22 5
4 0.30 0.33 0.32 42
5 0.21 0.19 0.20 26
6 0.33 0.25 0.29 20
7 0.20 0.12 0.15 8
8 0.19 0.15 0.17 20
9 0.89 1.00 0.94 17
10 0.20 0.17 0.18 18
11 0.23 0.12 0.16 58
12 0.38 0.35 0.37 51
13 0.00 0.00 0.00 5
14 0.40 0.29 0.33 7
15 0.00 0.00 0.00 5
16 0.50 0.67 0.57 3
17 0.86 0.71 0.78 72
18 0.35 0.35 0.35 20
19 0.00 0.00 0.00 13
20 0.00 0.00 0.00 2
21 0.00 0.00 0.00 8
22 0.19 0.30 0.23 20
23 0.33 0.33 0.33 42
24 0.24 0.67 0.35 6
25 0.43 0.12 0.19 24
27 0.21 0.21 0.21 14
28 0.08 0.08 0.08 12
29 0.00 0.00 0.00 1
30 0.50 0.50 0.50 2
31 0.00 0.00 0.00 2
32 0.00 0.00 0.00 2
33 0.00 0.00 0.00 6
34 0.17 0.12 0.14 26
35 0.00 0.00 0.00 8
36 0.33 0.20 0.25 10
37 0.20 0.12 0.15 8
38 0.00 0.00 0.00 0
39 0.00 0.00 0.00 1
40 0.00 0.00 0.00 12
41 0.00 0.00 0.00 2
42 0.00 0.00 0.00 9
43 0.50 0.33 0.40 6
44 0.00 0.00 0.00 0
45 0.59 0.57 0.58 28
46 0.00 0.00 0.00 2
47 0.00 0.00 0.00 1
48 0.00 0.00 0.00 1
49 1.00 0.33 0.50 3
50 0.00 0.00 0.00 0
51 0.00 0.00 0.00 1
56 0.66 0.46 0.54 46
57 1.00 0.20 0.33 5
58 0.00 0.00 0.00 0
59 0.29 0.40 0.33 5
60 0.00 0.00 0.00 1
62 0.00 0.00 0.00 2
63 0.00 0.00 0.00 1
64 0.00 0.00 0.00 1
66 0.00 0.00 0.00 1
67 0.62 0.28 0.38 18
69 0.00 0.00 0.00 0
70 0.00 0.00 0.00 1
72 0.58 0.74 0.65 121
73 0.35 0.16 0.22 56
accuracy 0.56 1700
macro avg 0.23 0.19 0.19 1700
weighted avg 0.53 0.56 0.54 1700
Time taken: 2.278121s to run the model
pickle.dump(DecisionTreeClassifier(), open('DecisionTreemodel.pkl','wb'))
Random Forest
run_classification(RandomForestClassifier(n_estimators=100), X_train, X_test, y_train, y_test)
Estimator: Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
('clf', RandomForestClassifier())])
================================================================================
Training accuracy: 95.60%
Testing accuracy: 62.41%
================================================================================
Confusion matrix:
[[755 0 0 ... 0 0 0]
[ 2 0 0 ... 0 1 0]
[ 15 0 8 ... 0 0 1]
...
[ 1 0 0 ... 0 0 0]
[ 4 1 2 ... 0 98 8]
[ 16 0 0 ... 0 28 10]]
================================================================================
Classification report:
precision recall f1-score support
0 0.60 0.99 0.75 761
1 0.00 0.00 0.00 8
2 0.80 0.33 0.47 24
3 0.00 0.00 0.00 5
4 0.61 0.40 0.49 42
5 0.40 0.08 0.13 26
6 0.75 0.15 0.25 20
7 0.00 0.00 0.00 8
8 0.00 0.00 0.00 20
9 1.00 0.88 0.94 17
10 1.00 0.11 0.20 18
11 1.00 0.03 0.07 58
12 0.83 0.39 0.53 51
13 0.00 0.00 0.00 5
14 1.00 0.14 0.25 7
15 0.00 0.00 0.00 5
16 0.00 0.00 0.00 3
17 0.91 0.86 0.89 72
18 0.80 0.20 0.32 20
19 0.00 0.00 0.00 13
20 0.00 0.00 0.00 2
21 0.00 0.00 0.00 8
22 1.00 0.10 0.18 20
23 1.00 0.10 0.17 42
24 0.27 0.67 0.38 6
25 1.00 0.17 0.29 24
27 0.60 0.21 0.32 14
28 0.50 0.08 0.14 12
29 0.00 0.00 0.00 1
30 1.00 1.00 1.00 2
31 0.00 0.00 0.00 2
32 0.00 0.00 0.00 2
33 0.00 0.00 0.00 6
34 0.75 0.12 0.20 26
35 0.00 0.00 0.00 8
36 1.00 0.10 0.18 10
37 1.00 0.12 0.22 8
39 0.00 0.00 0.00 1
40 0.00 0.00 0.00 12
41 0.00 0.00 0.00 2
42 0.00 0.00 0.00 9
43 1.00 0.33 0.50 6
45 0.72 0.46 0.57 28
46 0.00 0.00 0.00 2
47 0.00 0.00 0.00 1
48 0.00 0.00 0.00 1
49 0.00 0.00 0.00 3
51 0.00 0.00 0.00 1
56 0.84 0.46 0.59 46
57 1.00 0.20 0.33 5
59 0.00 0.00 0.00 5
60 0.00 0.00 0.00 1
62 0.00 0.00 0.00 2
63 0.00 0.00 0.00 1
64 0.00 0.00 0.00 1
66 0.00 0.00 0.00 1
67 0.00 0.00 0.00 18
70 0.00 0.00 0.00 1
72 0.59 0.81 0.68 121
73 0.45 0.18 0.26 56
accuracy 0.62 1700
macro avg 0.37 0.16 0.19 1700
weighted avg 0.62 0.62 0.54 1700
Time taken: 11.945270s to run the model
pickle.dump(RandomForestClassifier(n_estimators=100), open('RandomForestmodel.pkl','wb'))
Observations:
We'll be fine tuning the models and reduce the overfitting in next iteration.
Neural Network
#Path where you want to save the weights, model and checkpoints
model_path = "Weights/"
%mkdir Weights
# Define model callbacks
def call_backs(name):
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.005, patience=100)
model_checkpoint = ModelCheckpoint(model_path + name + '_epoch{epoch:02d}_loss{val_loss:.4f}.h5',
monitor='val_loss',
verbose=1,
save_best_only=True,
save_weights_only=False,
mode='min',
period=1)
return [model_checkpoint, early_stopping]
Deep Neural Networks
# Function to build Deep NN
def Build_Model_DNN_Text(shape, nClasses, dropout=0.3):
"""
buildModel_DNN_Tex(shape, nClasses,dropout)
Build Deep neural networks Model for text classification
Shape is input feature space
nClasses is number of classes
"""
model = Sequential()
node = 512 # number of nodes
nLayers = 4 # number of hidden layer
model.add(Dense(node,input_dim=shape,activation='relu'))
model.add(Dropout(dropout))
for i in range(0,nLayers):
model.add(Dense(node,input_dim=node,activation='relu'))
model.add(Dropout(dropout))
model.add(BatchNormalization())
model.add(Dense(nClasses, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy',
optimizer='adam',
metrics=['accuracy'])
print(model.summary())
return model
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(df.Summary)
X_train_tfidf = Tfidf_vect.transform(X_train).toarray()
X_test_tfidf = Tfidf_vect.transform(X_test).toarray()
# Instantiate the network
model_DNN = Build_Model_DNN_Text(X_train_tfidf.shape[1], 75)
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense (Dense) (None, 512) 2560512
dropout (Dropout) (None, 512) 0
dense_1 (Dense) (None, 512) 262656
dropout_1 (Dropout) (None, 512) 0
batch_normalization (BatchN (None, 512) 2048
ormalization)
dense_2 (Dense) (None, 512) 262656
dropout_2 (Dropout) (None, 512) 0
batch_normalization_1 (Batc (None, 512) 2048
hNormalization)
dense_3 (Dense) (None, 512) 262656
dropout_3 (Dropout) (None, 512) 0
batch_normalization_2 (Batc (None, 512) 2048
hNormalization)
dense_4 (Dense) (None, 512) 262656
dropout_4 (Dropout) (None, 512) 0
batch_normalization_3 (Batc (None, 512) 2048
hNormalization)
dense_5 (Dense) (None, 75) 38475
=================================================================
Total params: 3,657,803
Trainable params: 3,653,707
Non-trainable params: 4,096
_________________________________________________________________
None
model_DNN.fit(X_train_tfidf, y_train,
validation_data=(X_test_tfidf, y_test),
callbacks=call_backs("NN"),
epochs=10,
batch_size=128,
verbose=2)
predicted = model_DNN.predict(X_test_tfidf)
WARNING:tensorflow:`period` argument is deprecated. Please use `save_freq` to specify the frequency in number of batches seen. Epoch 1/10 Epoch 1: val_loss improved from inf to 2.98736, saving model to Weights/NN_epoch01_loss2.9874.h5 54/54 - 5s - loss: 3.0248 - accuracy: 0.3904 - val_loss: 2.9874 - val_accuracy: 0.4476 - 5s/epoch - 90ms/step Epoch 2/10 Epoch 2: val_loss improved from 2.98736 to 2.77127, saving model to Weights/NN_epoch02_loss2.7713.h5 54/54 - 1s - loss: 1.8502 - accuracy: 0.5832 - val_loss: 2.7713 - val_accuracy: 0.4476 - 554ms/epoch - 10ms/step Epoch 3/10 Epoch 3: val_loss did not improve from 2.77127 54/54 - 0s - loss: 1.5298 - accuracy: 0.6285 - val_loss: 2.7997 - val_accuracy: 0.4476 - 399ms/epoch - 7ms/step Epoch 4/10 Epoch 4: val_loss improved from 2.77127 to 2.76191, saving model to Weights/NN_epoch04_loss2.7619.h5 54/54 - 1s - loss: 1.2527 - accuracy: 0.6809 - val_loss: 2.7619 - val_accuracy: 0.4476 - 544ms/epoch - 10ms/step Epoch 5/10 Epoch 5: val_loss improved from 2.76191 to 2.72568, saving model to Weights/NN_epoch05_loss2.7257.h5 54/54 - 1s - loss: 1.0103 - accuracy: 0.7363 - val_loss: 2.7257 - val_accuracy: 0.4482 - 561ms/epoch - 10ms/step Epoch 6/10 Epoch 6: val_loss improved from 2.72568 to 2.44746, saving model to Weights/NN_epoch06_loss2.4475.h5 54/54 - 1s - loss: 0.8245 - accuracy: 0.7797 - val_loss: 2.4475 - val_accuracy: 0.4594 - 549ms/epoch - 10ms/step Epoch 7/10 Epoch 7: val_loss did not improve from 2.44746 54/54 - 0s - loss: 0.6992 - accuracy: 0.8094 - val_loss: 2.5148 - val_accuracy: 0.4841 - 423ms/epoch - 8ms/step Epoch 8/10 Epoch 8: val_loss improved from 2.44746 to 2.05476, saving model to Weights/NN_epoch08_loss2.0548.h5 54/54 - 1s - loss: 0.5992 - accuracy: 0.8359 - val_loss: 2.0548 - val_accuracy: 0.5918 - 537ms/epoch - 10ms/step Epoch 9/10 Epoch 9: val_loss did not improve from 2.05476 54/54 - 0s - loss: 0.5018 - accuracy: 0.8635 - val_loss: 2.0972 - val_accuracy: 0.6071 - 413ms/epoch - 8ms/step Epoch 10/10 Epoch 10: val_loss improved from 2.05476 to 1.94825, saving model to Weights/NN_epoch10_loss1.9482.h5 54/54 - 1s - loss: 0.4388 - accuracy: 0.8741 - val_loss: 1.9482 - val_accuracy: 0.6347 - 541ms/epoch - 10ms/step
pickle.dump(model_DNN, open('DNNmodel.pkl','wb'))
INFO:tensorflow:Assets written to: ram://4adea98c-5dea-4b17-92eb-bb8325ea7d8d/assets
Extract Glove Embeddings
gloveFileName = '/content/drive/MyDrive/Final_Shripad/glove.6B.200d.txt'
MAX_SEQUENCE_LENGTH = 500
EMBEDDING_DIM=200
MAX_NB_WORDS=75000
# Function to generate Embedding
def loadData_Tokenizer(X_train, X_test,filename):
np.random.seed(7)
text = np.concatenate((X_train, X_test), axis=0)
text = np.array(text)
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(text)
sequences = tokenizer.texts_to_sequences(text)
word_index = tokenizer.word_index
text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Found %s unique tokens.' % len(word_index))
indices = np.arange(text.shape[0])
# np.random.shuffle(indices)
text = text[indices]
print(text.shape)
X_train = text[0:len(X_train), ]
X_test = text[len(X_train):, ]
embeddings_index = {}
f = open(filename, encoding="utf8")
for line in f:
values = line.split()
word = values[0]
try:
coefs = np.asarray(values[1:], dtype='float32')
except:
pass
embeddings_index[word] = coefs
f.close()
print('Total %s word vectors.' % len(embeddings_index))
return (X_train, X_test, word_index,embeddings_index)
embedding_matrix = []
def buildEmbed_matrices(word_index,embedding_dim):
embedding_matrix = np.random.random((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
embedding_vector = embeddings_index.get(word)
if embedding_vector is not None:
# words not found in embedding index will be all-zeros.
if len(embedding_matrix[i]) !=len(embedding_vector):
print("could not broadcast input array from shape",str(len(embedding_matrix[i])), "into shape",str(len(embedding_vector)),
" Please make sure your"" EMBEDDING_DIM is equal to embedding_vector file ,GloVe,")
exit(1)
embedding_matrix[i] = embedding_vector
return embedding_matrix
# Generate Glove embedded datasets
X_train_Glove, X_test_Glove, word_index, embeddings_index = loadData_Tokenizer(X_train,X_test,gloveFileName)
embedding_matrix = buildEmbed_matrices(word_index,EMBEDDING_DIM)
Found 18630 unique tokens. (8500, 500) Total 400000 word vectors.
def Build_Model_CNN_Text(word_index, embeddings_matrix, nclasses,dropout=0.5):
"""
def buildModel_CNN(word_index, embeddings_index, nclasses, MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=50, dropout=0.5):
word_index in word index ,
embeddings_index is embeddings index, look at data_helper.py
nClasses is number of classes,
MAX_SEQUENCE_LENGTH is maximum lenght of text sequences,
EMBEDDING_DIM is an int value for dimention of word embedding look at data_helper.py
"""
model = Sequential()
embedding_layer = Embedding(len(word_index) + 1,
EMBEDDING_DIM,
weights=[embeddings_matrix],
input_length=MAX_SEQUENCE_LENGTH,
trainable=True)
# applying a more complex convolutional approach
convs = []
filter_sizes = []
layer = 5
print("Filter ",layer)
for fl in range(0,layer):
filter_sizes.append((fl+2))
node = 128
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
for fsz in filter_sizes:
l_conv = Conv1D(node, kernel_size=fsz, activation='relu')(embedded_sequences)
l_pool = MaxPooling1D(5)(l_conv)
#l_pool = Dropout(0.25)(l_pool)
convs.append(l_pool)
l_merge = Concatenate(axis=1)(convs)
l_cov1 = Conv1D(node, 5, activation='relu')(l_merge)
l_cov1 = Dropout(dropout)(l_cov1)
l_batch1 = BatchNormalization()(l_cov1)
l_pool1 = MaxPooling1D(5)(l_batch1)
l_cov2 = Conv1D(node, 5, activation='relu')(l_pool1)
l_cov2 = Dropout(dropout)(l_cov2)
l_batch2 = BatchNormalization()(l_cov2)
l_pool2 = MaxPooling1D(30)(l_batch2)
l_flat = Flatten()(l_pool2)
l_dense = Dense(1024, activation='relu')(l_flat)
l_dense = Dropout(dropout)(l_dense)
l_dense = Dense(512, activation='relu')(l_dense)
l_dense = Dropout(dropout)(l_dense)
preds = Dense(nclasses, activation='softmax')(l_dense)
model = Model(sequence_input, preds)
model.compile(loss='sparse_categorical_crossentropy',
optimizer='adam',
metrics=['accuracy'])
print(model.summary())
return model
# Train the network and run classification
model_CNN = Build_Model_CNN_Text(word_index,embedding_matrix, 75)
run_classification(model_CNN, X_train_Glove, X_test_Glove, y_train, y_test,pipelineRequired = False,isDeepModel=True, arch_name='CNN')
Filter 5
Model: "model"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_1 (InputLayer) [(None, 500)] 0 []
embedding (Embedding) (None, 500, 200) 3726200 ['input_1[0][0]']
conv1d (Conv1D) (None, 499, 128) 51328 ['embedding[0][0]']
conv1d_1 (Conv1D) (None, 498, 128) 76928 ['embedding[0][0]']
conv1d_2 (Conv1D) (None, 497, 128) 102528 ['embedding[0][0]']
conv1d_3 (Conv1D) (None, 496, 128) 128128 ['embedding[0][0]']
conv1d_4 (Conv1D) (None, 495, 128) 153728 ['embedding[0][0]']
max_pooling1d (MaxPooling1D) (None, 99, 128) 0 ['conv1d[0][0]']
max_pooling1d_1 (MaxPooling1D) (None, 99, 128) 0 ['conv1d_1[0][0]']
max_pooling1d_2 (MaxPooling1D) (None, 99, 128) 0 ['conv1d_2[0][0]']
max_pooling1d_3 (MaxPooling1D) (None, 99, 128) 0 ['conv1d_3[0][0]']
max_pooling1d_4 (MaxPooling1D) (None, 99, 128) 0 ['conv1d_4[0][0]']
concatenate (Concatenate) (None, 495, 128) 0 ['max_pooling1d[0][0]',
'max_pooling1d_1[0][0]',
'max_pooling1d_2[0][0]',
'max_pooling1d_3[0][0]',
'max_pooling1d_4[0][0]']
conv1d_5 (Conv1D) (None, 491, 128) 82048 ['concatenate[0][0]']
dropout_5 (Dropout) (None, 491, 128) 0 ['conv1d_5[0][0]']
batch_normalization_4 (BatchNo (None, 491, 128) 512 ['dropout_5[0][0]']
rmalization)
max_pooling1d_5 (MaxPooling1D) (None, 98, 128) 0 ['batch_normalization_4[0][0]']
conv1d_6 (Conv1D) (None, 94, 128) 82048 ['max_pooling1d_5[0][0]']
dropout_6 (Dropout) (None, 94, 128) 0 ['conv1d_6[0][0]']
batch_normalization_5 (BatchNo (None, 94, 128) 512 ['dropout_6[0][0]']
rmalization)
max_pooling1d_6 (MaxPooling1D) (None, 3, 128) 0 ['batch_normalization_5[0][0]']
flatten (Flatten) (None, 384) 0 ['max_pooling1d_6[0][0]']
dense_6 (Dense) (None, 1024) 394240 ['flatten[0][0]']
dropout_7 (Dropout) (None, 1024) 0 ['dense_6[0][0]']
dense_7 (Dense) (None, 512) 524800 ['dropout_7[0][0]']
dropout_8 (Dropout) (None, 512) 0 ['dense_7[0][0]']
dense_8 (Dense) (None, 75) 38475 ['dropout_8[0][0]']
==================================================================================================
Total params: 5,361,475
Trainable params: 5,360,963
Non-trainable params: 512
__________________________________________________________________________________________________
None
WARNING:tensorflow:`period` argument is deprecated. Please use `save_freq` to specify the frequency in number of batches seen.
Epoch 1/10
54/54 [==============================] - ETA: 0s - loss: 3.0708 - accuracy: 0.4404
Epoch 1: val_loss improved from inf to 3.13401, saving model to Weights/CNN_epoch01_loss3.1340.h5
54/54 [==============================] - 19s 129ms/step - loss: 3.0708 - accuracy: 0.4404 - val_loss: 3.1340 - val_accuracy: 0.4476
Epoch 2/10
53/54 [============================>.] - ETA: 0s - loss: 2.5724 - accuracy: 0.4685
Epoch 2: val_loss did not improve from 3.13401
54/54 [==============================] - 5s 102ms/step - loss: 2.5724 - accuracy: 0.4687 - val_loss: 3.7322 - val_accuracy: 0.4476
Epoch 3/10
53/54 [============================>.] - ETA: 0s - loss: 2.4685 - accuracy: 0.4744
Epoch 3: val_loss did not improve from 3.13401
54/54 [==============================] - 6s 102ms/step - loss: 2.4677 - accuracy: 0.4746 - val_loss: 3.7270 - val_accuracy: 0.4918
Epoch 4/10
53/54 [============================>.] - ETA: 0s - loss: 2.2751 - accuracy: 0.5228
Epoch 4: val_loss did not improve from 3.13401
54/54 [==============================] - 6s 103ms/step - loss: 2.2739 - accuracy: 0.5229 - val_loss: 3.3972 - val_accuracy: 0.5065
Epoch 5/10
53/54 [============================>.] - ETA: 0s - loss: 2.1452 - accuracy: 0.5401
Epoch 5: val_loss improved from 3.13401 to 3.04753, saving model to Weights/CNN_epoch05_loss3.0475.h5
54/54 [==============================] - 6s 106ms/step - loss: 2.1457 - accuracy: 0.5401 - val_loss: 3.0475 - val_accuracy: 0.4594
Epoch 6/10
53/54 [============================>.] - ETA: 0s - loss: 2.0370 - accuracy: 0.5457
Epoch 6: val_loss improved from 3.04753 to 2.87801, saving model to Weights/CNN_epoch06_loss2.8780.h5
54/54 [==============================] - 6s 112ms/step - loss: 2.0371 - accuracy: 0.5456 - val_loss: 2.8780 - val_accuracy: 0.4488
Epoch 7/10
53/54 [============================>.] - ETA: 0s - loss: 1.9657 - accuracy: 0.5540
Epoch 7: val_loss improved from 2.87801 to 2.77804, saving model to Weights/CNN_epoch07_loss2.7780.h5
54/54 [==============================] - 6s 107ms/step - loss: 1.9651 - accuracy: 0.5541 - val_loss: 2.7780 - val_accuracy: 0.4488
Epoch 8/10
53/54 [============================>.] - ETA: 0s - loss: 1.8704 - accuracy: 0.5650
Epoch 8: val_loss improved from 2.77804 to 2.62549, saving model to Weights/CNN_epoch08_loss2.6255.h5
54/54 [==============================] - 6s 108ms/step - loss: 1.8715 - accuracy: 0.5646 - val_loss: 2.6255 - val_accuracy: 0.4506
Epoch 9/10
53/54 [============================>.] - ETA: 0s - loss: 1.7583 - accuracy: 0.5749
Epoch 9: val_loss improved from 2.62549 to 2.55750, saving model to Weights/CNN_epoch09_loss2.5575.h5
54/54 [==============================] - 6s 111ms/step - loss: 1.7583 - accuracy: 0.5749 - val_loss: 2.5575 - val_accuracy: 0.4494
Epoch 10/10
53/54 [============================>.] - ETA: 0s - loss: 1.7060 - accuracy: 0.5812
Epoch 10: val_loss improved from 2.55750 to 2.40942, saving model to Weights/CNN_epoch10_loss2.4094.h5
54/54 [==============================] - 6s 109ms/step - loss: 1.7053 - accuracy: 0.5815 - val_loss: 2.4094 - val_accuracy: 0.4512
Estimator: <keras.engine.functional.Functional object at 0x7fb2ec62f210>
================================================================================
Training accuracy: 47.69%
Testing accuracy: 45.12%
================================================================================
Confusion matrix:
[[760 0 0 ... 0 0 0]
[ 6 0 0 ... 0 0 0]
[ 14 0 0 ... 0 0 0]
...
[ 1 0 0 ... 0 0 0]
[ 20 0 0 ... 0 0 0]
[ 17 0 0 ... 0 0 0]]
================================================================================
Classification report:
precision recall f1-score support
0 0.52 1.00 0.68 761
1 0.00 0.00 0.00 8
2 0.00 0.00 0.00 24
3 0.00 0.00 0.00 5
4 0.01 0.05 0.01 42
5 0.00 0.00 0.00 26
6 0.00 0.00 0.00 20
7 0.00 0.00 0.00 8
8 0.00 0.00 0.00 20
9 0.00 0.00 0.00 17
10 0.00 0.00 0.00 18
11 0.00 0.00 0.00 58
12 0.56 0.10 0.17 51
13 0.00 0.00 0.00 5
14 0.00 0.00 0.00 7
15 0.00 0.00 0.00 5
16 0.00 0.00 0.00 3
17 0.00 0.00 0.00 72
18 0.00 0.00 0.00 20
19 0.00 0.00 0.00 13
20 0.00 0.00 0.00 2
21 0.00 0.00 0.00 8
22 0.00 0.00 0.00 20
23 0.00 0.00 0.00 42
24 0.00 0.00 0.00 6
25 0.00 0.00 0.00 24
27 0.00 0.00 0.00 14
28 0.00 0.00 0.00 12
29 0.00 0.00 0.00 1
30 0.00 0.00 0.00 2
31 0.00 0.00 0.00 2
32 0.00 0.00 0.00 2
33 0.00 0.00 0.00 6
34 0.00 0.00 0.00 26
35 0.00 0.00 0.00 8
36 0.00 0.00 0.00 10
37 0.00 0.00 0.00 8
39 0.00 0.00 0.00 1
40 0.00 0.00 0.00 12
41 0.00 0.00 0.00 2
42 0.00 0.00 0.00 9
43 0.00 0.00 0.00 6
45 0.00 0.00 0.00 28
46 0.00 0.00 0.00 2
47 0.00 0.00 0.00 1
48 0.00 0.00 0.00 1
49 0.00 0.00 0.00 3
51 0.00 0.00 0.00 1
56 0.00 0.00 0.00 46
57 0.00 0.00 0.00 5
59 0.00 0.00 0.00 5
60 0.00 0.00 0.00 1
62 0.00 0.00 0.00 2
63 0.00 0.00 0.00 1
64 0.00 0.00 0.00 1
66 0.00 0.00 0.00 1
67 0.00 0.00 0.00 18
70 0.00 0.00 0.00 1
72 0.00 0.00 0.00 121
73 0.00 0.00 0.00 56
accuracy 0.45 1700
macro avg 0.02 0.02 0.01 1700
weighted avg 0.25 0.45 0.31 1700
Time taken: 74.848827s to run the model
pickle.dump(model_CNN, open('CNNmodel.pkl','wb'))
INFO:tensorflow:Assets written to: ram://7bdc1806-e874-49f3-abe6-e23af5695ac3/assets
Recurrent Neural Networks (RNN)
def Build_Model_RNN_Text(word_index, embeddings_matrix, nclasses,dropout=0.5):
"""
def buildModel_RNN(word_index, embeddings_matrix, nclasses, MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=100, dropout=0.5):
word_index in word index ,
embeddings_matrix is embeddings_matrix, look at data_helper.py
nClasses is number of classes,
MAX_SEQUENCE_LENGTH is maximum lenght of text sequences
"""
model = Sequential()
hidden_layer = 3
gru_node = 32
model.add(Embedding(len(word_index) + 1,
EMBEDDING_DIM,
weights=[embeddings_matrix],
input_length=MAX_SEQUENCE_LENGTH,
trainable=True))
print(gru_node)
for i in range(0,hidden_layer):
model.add(GRU(gru_node,return_sequences=True, recurrent_dropout=0.2))
model.add(Dropout(dropout))
model.add(BatchNormalization())
model.add(GRU(gru_node, recurrent_dropout=0.2))
model.add(Dropout(dropout))
model.add(BatchNormalization())
model.add(Dense(256, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(nclasses, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy',
optimizer='sgd',
metrics=['accuracy'])
print(model.summary())
return model
# Train the network and run classification
model_RNN = Build_Model_RNN_Text(word_index,embedding_matrix, 75)
run_classification(model_RNN, X_train_Glove, X_test_Glove, y_train, y_test,pipelineRequired = False,isDeepModel=True, arch_name='RNN')
32
WARNING:tensorflow:Layer gru will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer gru_1 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer gru_2 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer gru_3 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
Model: "sequential_2"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_1 (Embedding) (None, 500, 200) 3726200
gru (GRU) (None, 500, 32) 22464
dropout_9 (Dropout) (None, 500, 32) 0
batch_normalization_6 (Batc (None, 500, 32) 128
hNormalization)
gru_1 (GRU) (None, 500, 32) 6336
dropout_10 (Dropout) (None, 500, 32) 0
batch_normalization_7 (Batc (None, 500, 32) 128
hNormalization)
gru_2 (GRU) (None, 500, 32) 6336
dropout_11 (Dropout) (None, 500, 32) 0
batch_normalization_8 (Batc (None, 500, 32) 128
hNormalization)
gru_3 (GRU) (None, 32) 6336
dropout_12 (Dropout) (None, 32) 0
batch_normalization_9 (Batc (None, 32) 128
hNormalization)
dense_9 (Dense) (None, 256) 8448
batch_normalization_10 (Bat (None, 256) 1024
chNormalization)
dense_10 (Dense) (None, 75) 19275
=================================================================
Total params: 3,796,931
Trainable params: 3,796,163
Non-trainable params: 768
_________________________________________________________________
None
WARNING:tensorflow:`period` argument is deprecated. Please use `save_freq` to specify the frequency in number of batches seen.
Epoch 1/10
54/54 [==============================] - ETA: 0s - loss: 4.6477 - accuracy: 0.0665
Epoch 1: val_loss improved from inf to 3.94416, saving model to Weights/RNN_epoch01_loss3.9442.h5
54/54 [==============================] - 338s 6s/step - loss: 4.6477 - accuracy: 0.0665 - val_loss: 3.9442 - val_accuracy: 0.4429
Epoch 2/10
54/54 [==============================] - ETA: 0s - loss: 3.8516 - accuracy: 0.2863
Epoch 2: val_loss improved from 3.94416 to 3.28865, saving model to Weights/RNN_epoch02_loss3.2886.h5
54/54 [==============================] - 310s 6s/step - loss: 3.8516 - accuracy: 0.2863 - val_loss: 3.2886 - val_accuracy: 0.4918
Epoch 3/10
54/54 [==============================] - ETA: 0s - loss: 3.3395 - accuracy: 0.4028
Epoch 3: val_loss improved from 3.28865 to 2.86878, saving model to Weights/RNN_epoch03_loss2.8688.h5
54/54 [==============================] - 311s 6s/step - loss: 3.3395 - accuracy: 0.4028 - val_loss: 2.8688 - val_accuracy: 0.5041
Epoch 4/10
54/54 [==============================] - ETA: 0s - loss: 2.9774 - accuracy: 0.4665
Epoch 4: val_loss improved from 2.86878 to 2.66555, saving model to Weights/RNN_epoch04_loss2.6656.h5
54/54 [==============================] - 311s 6s/step - loss: 2.9774 - accuracy: 0.4665 - val_loss: 2.6656 - val_accuracy: 0.5076
Epoch 5/10
54/54 [==============================] - ETA: 0s - loss: 2.7753 - accuracy: 0.5031
Epoch 5: val_loss improved from 2.66555 to 2.50902, saving model to Weights/RNN_epoch05_loss2.5090.h5
54/54 [==============================] - 324s 6s/step - loss: 2.7753 - accuracy: 0.5031 - val_loss: 2.5090 - val_accuracy: 0.5194
Epoch 6/10
54/54 [==============================] - ETA: 0s - loss: 2.6335 - accuracy: 0.5200
Epoch 6: val_loss did not improve from 2.50902
54/54 [==============================] - 358s 7s/step - loss: 2.6335 - accuracy: 0.5200 - val_loss: 2.5332 - val_accuracy: 0.5124
Epoch 7/10
54/54 [==============================] - ETA: 0s - loss: 2.5442 - accuracy: 0.5300
Epoch 7: val_loss improved from 2.50902 to 2.41010, saving model to Weights/RNN_epoch07_loss2.4101.h5
54/54 [==============================] - 337s 6s/step - loss: 2.5442 - accuracy: 0.5300 - val_loss: 2.4101 - val_accuracy: 0.5241
Epoch 8/10
54/54 [==============================] - ETA: 0s - loss: 2.4866 - accuracy: 0.5316
Epoch 8: val_loss improved from 2.41010 to 2.40547, saving model to Weights/RNN_epoch08_loss2.4055.h5
54/54 [==============================] - 309s 6s/step - loss: 2.4866 - accuracy: 0.5316 - val_loss: 2.4055 - val_accuracy: 0.5259
Epoch 9/10
54/54 [==============================] - ETA: 0s - loss: 2.4136 - accuracy: 0.5372
Epoch 9: val_loss improved from 2.40547 to 2.34145, saving model to Weights/RNN_epoch09_loss2.3415.h5
54/54 [==============================] - 313s 6s/step - loss: 2.4136 - accuracy: 0.5372 - val_loss: 2.3415 - val_accuracy: 0.5288
Epoch 10/10
54/54 [==============================] - ETA: 0s - loss: 2.3825 - accuracy: 0.5375
Epoch 10: val_loss did not improve from 2.34145
54/54 [==============================] - 311s 6s/step - loss: 2.3825 - accuracy: 0.5375 - val_loss: 2.3511 - val_accuracy: 0.5353
Estimator: <keras.engine.sequential.Sequential object at 0x7fb27a349d50>
================================================================================
Training accuracy: 55.75%
Testing accuracy: 53.53%
================================================================================
Confusion matrix:
[[744 0 0 ... 0 1 0]
[ 4 0 0 ... 0 3 0]
[ 15 0 0 ... 0 9 0]
...
[ 1 0 0 ... 0 0 0]
[ 14 0 0 ... 0 107 0]
[ 17 0 0 ... 0 38 0]]
================================================================================
Classification report:
precision recall f1-score support
0 0.55 0.98 0.70 761
1 0.00 0.00 0.00 8
2 0.00 0.00 0.00 24
3 0.00 0.00 0.00 5
4 1.00 0.02 0.05 42
5 0.25 0.04 0.07 26
6 0.00 0.00 0.00 20
7 0.00 0.00 0.00 8
8 0.00 0.00 0.00 20
9 0.00 0.00 0.00 17
10 0.00 0.00 0.00 18
11 0.00 0.00 0.00 58
12 0.00 0.00 0.00 51
13 0.00 0.00 0.00 5
14 0.00 0.00 0.00 7
15 0.00 0.00 0.00 5
16 0.00 0.00 0.00 3
17 0.51 0.79 0.62 72
18 0.00 0.00 0.00 20
19 0.00 0.00 0.00 13
20 0.00 0.00 0.00 2
21 0.00 0.00 0.00 8
22 0.00 0.00 0.00 20
23 0.00 0.00 0.00 42
24 0.00 0.00 0.00 6
25 0.00 0.00 0.00 24
27 0.00 0.00 0.00 14
28 0.00 0.00 0.00 12
29 0.00 0.00 0.00 1
30 0.00 0.00 0.00 2
31 0.00 0.00 0.00 2
32 0.00 0.00 0.00 2
33 0.00 0.00 0.00 6
34 0.00 0.00 0.00 26
35 0.00 0.00 0.00 8
36 0.00 0.00 0.00 10
37 0.00 0.00 0.00 8
39 0.00 0.00 0.00 1
40 0.00 0.00 0.00 12
41 0.00 0.00 0.00 2
42 0.00 0.00 0.00 9
43 0.00 0.00 0.00 6
45 0.00 0.00 0.00 28
46 0.00 0.00 0.00 2
47 0.00 0.00 0.00 1
48 0.00 0.00 0.00 1
49 0.00 0.00 0.00 3
51 0.00 0.00 0.00 1
56 0.00 0.00 0.00 46
57 0.00 0.00 0.00 5
59 0.00 0.00 0.00 5
60 0.00 0.00 0.00 1
62 0.00 0.00 0.00 2
63 0.00 0.00 0.00 1
64 0.00 0.00 0.00 1
66 0.00 0.00 0.00 1
67 0.00 0.00 0.00 18
70 0.00 0.00 0.00 1
72 0.47 0.88 0.61 121
73 0.00 0.00 0.00 56
accuracy 0.54 1700
macro avg 0.05 0.05 0.03 1700
weighted avg 0.33 0.54 0.39 1700
Time taken: 3390.655521s to run the model
pickle.dump(model_RNN, open('RNNmodel.pkl','wb'))
INFO:tensorflow:Assets written to: ram://fe2656c2-7b16-4434-8d5d-5fbec09c838a/assets
WARNING:absl:<keras.layers.recurrent.GRUCell object at 0x7fb27a4bc050> has the same name 'GRUCell' as a built-in Keras object. Consider renaming <class 'keras.layers.recurrent.GRUCell'> to avoid naming conflicts when loading with `tf.keras.models.load_model`. If renaming is not possible, pass the object in the `custom_objects` parameter of the load function. WARNING:absl:<keras.layers.recurrent.GRUCell object at 0x7fb2f611c090> has the same name 'GRUCell' as a built-in Keras object. Consider renaming <class 'keras.layers.recurrent.GRUCell'> to avoid naming conflicts when loading with `tf.keras.models.load_model`. If renaming is not possible, pass the object in the `custom_objects` parameter of the load function. WARNING:absl:<keras.layers.recurrent.GRUCell object at 0x7fb27a465550> has the same name 'GRUCell' as a built-in Keras object. Consider renaming <class 'keras.layers.recurrent.GRUCell'> to avoid naming conflicts when loading with `tf.keras.models.load_model`. If renaming is not possible, pass the object in the `custom_objects` parameter of the load function. WARNING:absl:<keras.layers.recurrent.GRUCell object at 0x7fb2f5c1ad50> has the same name 'GRUCell' as a built-in Keras object. Consider renaming <class 'keras.layers.recurrent.GRUCell'> to avoid naming conflicts when loading with `tf.keras.models.load_model`. If renaming is not possible, pass the object in the `custom_objects` parameter of the load function.
Recurring Convolution Neural Network(RCNN)
def Build_Model_RCNN_Text(word_index, embeddings_matrix, nclasses):
kernel_size = 2
filters = 256
pool_size = 2
gru_node = 256
model = Sequential()
model.add(Embedding(len(word_index) + 1,
EMBEDDING_DIM,
weights=[embeddings_matrix],
input_length=MAX_SEQUENCE_LENGTH,
trainable=True))
model.add(Dropout(0.25))
model.add(BatchNormalization())
model.add(Conv1D(filters, kernel_size, activation='relu'))
model.add(MaxPooling1D(pool_size=pool_size))
model.add(Conv1D(filters, kernel_size, activation='relu'))
model.add(MaxPooling1D(pool_size=pool_size))
model.add(Conv1D(filters, kernel_size, activation='relu'))
model.add(MaxPooling1D(pool_size=pool_size))
model.add(Conv1D(filters, kernel_size, activation='relu'))
model.add(MaxPooling1D(pool_size=pool_size))
model.add(LSTM(gru_node, return_sequences=True, recurrent_dropout=0.2))
model.add(Dropout(0.25))
model.add(BatchNormalization())
model.add(LSTM(gru_node, return_sequences=True, recurrent_dropout=0.2))
model.add(Dropout(0.25))
model.add(BatchNormalization())
model.add(LSTM(gru_node, return_sequences=True, recurrent_dropout=0.2))
model.add(Dropout(0.25))
model.add(BatchNormalization())
model.add(LSTM(gru_node, recurrent_dropout=0.2))
model.add(Dropout(0.25))
model.add(BatchNormalization())
model.add(Dense(1024,activation='relu'))
model.add(Dense(nclasses))
model.add(Activation('softmax'))
model.compile(loss='sparse_categorical_crossentropy',
optimizer='sgd',
metrics=['accuracy'])
print(model.summary())
return model
# Train the network and run classification
model_RCNN = Build_Model_CNN_Text(word_index,embedding_matrix, 75)
run_classification(model_RCNN, X_train_Glove, X_test_Glove, y_train, y_test,pipelineRequired = False,isDeepModel=True, arch_name='RCNN')
Filter 5
Model: "model_1"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_2 (InputLayer) [(None, 500)] 0 []
embedding_2 (Embedding) (None, 500, 200) 3726200 ['input_2[0][0]']
conv1d_7 (Conv1D) (None, 499, 128) 51328 ['embedding_2[0][0]']
conv1d_8 (Conv1D) (None, 498, 128) 76928 ['embedding_2[0][0]']
conv1d_9 (Conv1D) (None, 497, 128) 102528 ['embedding_2[0][0]']
conv1d_10 (Conv1D) (None, 496, 128) 128128 ['embedding_2[0][0]']
conv1d_11 (Conv1D) (None, 495, 128) 153728 ['embedding_2[0][0]']
max_pooling1d_7 (MaxPooling1D) (None, 99, 128) 0 ['conv1d_7[0][0]']
max_pooling1d_8 (MaxPooling1D) (None, 99, 128) 0 ['conv1d_8[0][0]']
max_pooling1d_9 (MaxPooling1D) (None, 99, 128) 0 ['conv1d_9[0][0]']
max_pooling1d_10 (MaxPooling1D (None, 99, 128) 0 ['conv1d_10[0][0]']
)
max_pooling1d_11 (MaxPooling1D (None, 99, 128) 0 ['conv1d_11[0][0]']
)
concatenate_1 (Concatenate) (None, 495, 128) 0 ['max_pooling1d_7[0][0]',
'max_pooling1d_8[0][0]',
'max_pooling1d_9[0][0]',
'max_pooling1d_10[0][0]',
'max_pooling1d_11[0][0]']
conv1d_12 (Conv1D) (None, 491, 128) 82048 ['concatenate_1[0][0]']
dropout_13 (Dropout) (None, 491, 128) 0 ['conv1d_12[0][0]']
batch_normalization_11 (BatchN (None, 491, 128) 512 ['dropout_13[0][0]']
ormalization)
max_pooling1d_12 (MaxPooling1D (None, 98, 128) 0 ['batch_normalization_11[0][0]']
)
conv1d_13 (Conv1D) (None, 94, 128) 82048 ['max_pooling1d_12[0][0]']
dropout_14 (Dropout) (None, 94, 128) 0 ['conv1d_13[0][0]']
batch_normalization_12 (BatchN (None, 94, 128) 512 ['dropout_14[0][0]']
ormalization)
max_pooling1d_13 (MaxPooling1D (None, 3, 128) 0 ['batch_normalization_12[0][0]']
)
flatten_1 (Flatten) (None, 384) 0 ['max_pooling1d_13[0][0]']
dense_11 (Dense) (None, 1024) 394240 ['flatten_1[0][0]']
dropout_15 (Dropout) (None, 1024) 0 ['dense_11[0][0]']
dense_12 (Dense) (None, 512) 524800 ['dropout_15[0][0]']
dropout_16 (Dropout) (None, 512) 0 ['dense_12[0][0]']
dense_13 (Dense) (None, 75) 38475 ['dropout_16[0][0]']
==================================================================================================
Total params: 5,361,475
Trainable params: 5,360,963
Non-trainable params: 512
__________________________________________________________________________________________________
None
WARNING:tensorflow:`period` argument is deprecated. Please use `save_freq` to specify the frequency in number of batches seen.
WARNING:tensorflow:`period` argument is deprecated. Please use `save_freq` to specify the frequency in number of batches seen.
Epoch 1/10
53/54 [============================>.] - ETA: 0s - loss: 3.1633 - accuracy: 0.4307
Epoch 1: val_loss improved from inf to 3.06498, saving model to Weights/RCNN_epoch01_loss3.0650.h5
54/54 [==============================] - 7s 111ms/step - loss: 3.1627 - accuracy: 0.4306 - val_loss: 3.0650 - val_accuracy: 0.4476
Epoch 2/10
53/54 [============================>.] - ETA: 0s - loss: 2.5887 - accuracy: 0.4736
Epoch 2: val_loss did not improve from 3.06498
54/54 [==============================] - 6s 109ms/step - loss: 2.5912 - accuracy: 0.4732 - val_loss: 3.7624 - val_accuracy: 0.4476
Epoch 3/10
53/54 [============================>.] - ETA: 0s - loss: 2.5213 - accuracy: 0.4730
Epoch 3: val_loss did not improve from 3.06498
54/54 [==============================] - 6s 104ms/step - loss: 2.5205 - accuracy: 0.4731 - val_loss: 3.5724 - val_accuracy: 0.4471
Epoch 4/10
53/54 [============================>.] - ETA: 0s - loss: 2.3584 - accuracy: 0.4956
Epoch 4: val_loss did not improve from 3.06498
54/54 [==============================] - 6s 104ms/step - loss: 2.3591 - accuracy: 0.4954 - val_loss: 3.3021 - val_accuracy: 0.4471
Epoch 5/10
53/54 [============================>.] - ETA: 0s - loss: 2.2103 - accuracy: 0.5321
Epoch 5: val_loss did not improve from 3.06498
54/54 [==============================] - 6s 105ms/step - loss: 2.2100 - accuracy: 0.5321 - val_loss: 3.0706 - val_accuracy: 0.5059
Epoch 6/10
53/54 [============================>.] - ETA: 0s - loss: 2.0665 - accuracy: 0.5429
Epoch 6: val_loss improved from 3.06498 to 3.06361, saving model to Weights/RCNN_epoch06_loss3.0636.h5
54/54 [==============================] - 6s 109ms/step - loss: 2.0659 - accuracy: 0.5431 - val_loss: 3.0636 - val_accuracy: 0.5112
Epoch 7/10
53/54 [============================>.] - ETA: 0s - loss: 1.9312 - accuracy: 0.5482
Epoch 7: val_loss improved from 3.06361 to 2.89996, saving model to Weights/RCNN_epoch07_loss2.9000.h5
54/54 [==============================] - 6s 112ms/step - loss: 1.9312 - accuracy: 0.5482 - val_loss: 2.9000 - val_accuracy: 0.5053
Epoch 8/10
53/54 [============================>.] - ETA: 0s - loss: 1.8566 - accuracy: 0.5575
Epoch 8: val_loss did not improve from 2.89996
54/54 [==============================] - 6s 114ms/step - loss: 1.8563 - accuracy: 0.5576 - val_loss: 2.9997 - val_accuracy: 0.4859
Epoch 9/10
53/54 [============================>.] - ETA: 0s - loss: 1.7726 - accuracy: 0.5696
Epoch 9: val_loss improved from 2.89996 to 2.85923, saving model to Weights/RCNN_epoch09_loss2.8592.h5
54/54 [==============================] - 6s 119ms/step - loss: 1.7727 - accuracy: 0.5696 - val_loss: 2.8592 - val_accuracy: 0.4794
Epoch 10/10
53/54 [============================>.] - ETA: 0s - loss: 1.7348 - accuracy: 0.5744
Epoch 10: val_loss improved from 2.85923 to 2.39275, saving model to Weights/RCNN_epoch10_loss2.3927.h5
54/54 [==============================] - 6s 119ms/step - loss: 1.7342 - accuracy: 0.5747 - val_loss: 2.3927 - val_accuracy: 0.4994
Estimator: <keras.engine.functional.Functional object at 0x7fb0a0f79b50>
================================================================================
Training accuracy: 56.72%
Testing accuracy: 49.94%
================================================================================
Confusion matrix:
[[715 0 0 ... 0 0 0]
[ 0 0 0 ... 0 2 0]
[ 7 0 0 ... 0 9 0]
...
[ 1 0 0 ... 0 0 0]
[ 1 0 0 ... 0 96 0]
[ 10 0 0 ... 0 38 0]]
================================================================================
Classification report:
precision recall f1-score support
0 0.70 0.94 0.80 761
1 0.00 0.00 0.00 8
2 0.00 0.00 0.00 24
3 0.00 0.00 0.00 5
4 0.07 0.64 0.13 42
5 0.00 0.00 0.00 26
6 0.00 0.00 0.00 20
7 0.00 0.00 0.00 8
8 0.00 0.00 0.00 20
9 0.00 0.00 0.00 17
10 0.00 0.00 0.00 18
11 0.00 0.00 0.00 58
12 0.11 0.22 0.14 51
13 0.00 0.00 0.00 5
14 0.00 0.00 0.00 7
15 0.00 0.00 0.00 5
16 0.00 0.00 0.00 3
17 0.00 0.00 0.00 72
18 0.00 0.00 0.00 20
19 0.00 0.00 0.00 13
20 0.00 0.00 0.00 2
21 0.00 0.00 0.00 8
22 0.00 0.00 0.00 20
23 0.00 0.00 0.00 42
24 0.00 0.00 0.00 6
25 0.00 0.00 0.00 24
27 0.00 0.00 0.00 14
28 0.00 0.00 0.00 12
29 0.00 0.00 0.00 1
30 0.00 0.00 0.00 2
31 0.00 0.00 0.00 2
32 0.00 0.00 0.00 2
33 0.00 0.00 0.00 6
34 0.00 0.00 0.00 26
35 0.00 0.00 0.00 8
36 0.00 0.00 0.00 10
37 0.00 0.00 0.00 8
39 0.00 0.00 0.00 1
40 0.00 0.00 0.00 12
41 0.00 0.00 0.00 2
42 0.00 0.00 0.00 9
43 0.00 0.00 0.00 6
45 0.00 0.00 0.00 28
46 0.00 0.00 0.00 2
47 0.00 0.00 0.00 1
48 0.00 0.00 0.00 1
49 0.00 0.00 0.00 3
51 0.00 0.00 0.00 1
56 0.00 0.00 0.00 46
57 0.00 0.00 0.00 5
59 0.00 0.00 0.00 5
60 0.00 0.00 0.00 1
62 0.00 0.00 0.00 2
63 0.00 0.00 0.00 1
64 0.00 0.00 0.00 1
66 0.00 0.00 0.00 1
67 0.00 0.00 0.00 18
70 0.00 0.00 0.00 1
72 0.47 0.79 0.59 121
73 0.00 0.00 0.00 56
accuracy 0.50 1700
macro avg 0.02 0.04 0.03 1700
weighted avg 0.35 0.50 0.41 1700
Time taken: 86.775323s to run the model
pickle.dump(model_RCNN, open('RCNNmodel.pkl','wb'))
INFO:tensorflow:Assets written to: ram://12c05a3e-65e0-4686-bb5c-e518699713aa/assets
INFO:tensorflow:Assets written to: ram://12c05a3e-65e0-4686-bb5c-e518699713aa/assets
RNN with LSTM networks
EMBEDDING_DIM = 100
gloveFileName = '/content/drive/MyDrive/Final_Shripad/glove.6B.100d.txt'
from keras.models import Sequential
from keras.layers import Dense, LSTM, TimeDistributed, Activation
from keras.layers import Flatten, Permute, merge, Input
from keras.layers import Embedding
from keras.models import Model
from keras.layers import Input, Dense, multiply, concatenate, Dropout
from keras.layers import GRU, Bidirectional
def Build_Model_LTSM_Text(word_index, embeddings_matrix, nclasses):
kernel_size = 2
filters = 256
pool_size = 2
gru_node = 256
model = Sequential()
model.add(Embedding(len(word_index) + 1,
EMBEDDING_DIM,
weights=[embeddings_matrix],
input_length=MAX_SEQUENCE_LENGTH,
trainable=True))
model.add(Dropout(0.25))
model.add(Conv1D(filters, kernel_size, activation='relu'))
model.add(MaxPooling1D(pool_size=pool_size))
model.add(Conv1D(filters, kernel_size, activation='relu'))
model.add(MaxPooling1D(pool_size=pool_size))
model.add(Conv1D(filters, kernel_size, activation='relu'))
model.add(MaxPooling1D(pool_size=pool_size))
model.add(Conv1D(filters, kernel_size, activation='relu'))
model.add(MaxPooling1D(pool_size=pool_size))
model.add(Bidirectional(LSTM(gru_node, return_sequences=True, recurrent_dropout=0.2)))
model.add(Bidirectional(LSTM(gru_node, return_sequences=True, recurrent_dropout=0.2)))
model.add(Bidirectional(LSTM(gru_node, return_sequences=True, recurrent_dropout=0.2)))
model.add(Bidirectional(LSTM(gru_node, recurrent_dropout=0.2)))
model.add(Dense(1024,activation='relu'))
model.add(Dense(nclasses))
model.add(Activation('softmax'))
model.compile(loss='sparse_categorical_crossentropy',
optimizer='adam',
metrics=['accuracy'])
print(model.summary())
return model
X_train_Glove,X_test_Glove, word_index,embeddings_index = loadData_Tokenizer(X_train,X_test,gloveFileName)
embedding_matrix = buildEmbed_matrices(word_index,EMBEDDING_DIM)
model_LTSM = Build_Model_LTSM_Text(word_index,embedding_matrix, 75)
run_classification(model_LTSM, X_train_Glove, X_test_Glove, y_train, y_test,pipelineRequired = False,isDeepModel=True, arch_name='LSTM')
Found 18630 unique tokens. (8500, 500) Total 400000 word vectors. WARNING:tensorflow:Layer lstm will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm_1 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm_1 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm_1 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm_1 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm_1 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm_1 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm_2 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm_2 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm_2 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm_2 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm_2 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm_2 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm_3 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm_3 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm_3 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm_3 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm_3 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm_3 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
Model: "sequential_4"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_3 (Embedding) (None, 500, 100) 1863100
dropout_17 (Dropout) (None, 500, 100) 0
conv1d_14 (Conv1D) (None, 499, 256) 51456
max_pooling1d_14 (MaxPoolin (None, 249, 256) 0
g1D)
conv1d_15 (Conv1D) (None, 248, 256) 131328
max_pooling1d_15 (MaxPoolin (None, 124, 256) 0
g1D)
conv1d_16 (Conv1D) (None, 123, 256) 131328
max_pooling1d_16 (MaxPoolin (None, 61, 256) 0
g1D)
conv1d_17 (Conv1D) (None, 60, 256) 131328
max_pooling1d_17 (MaxPoolin (None, 30, 256) 0
g1D)
bidirectional (Bidirectiona (None, 30, 512) 1050624
l)
bidirectional_1 (Bidirectio (None, 30, 512) 1574912
nal)
bidirectional_2 (Bidirectio (None, 30, 512) 1574912
nal)
bidirectional_3 (Bidirectio (None, 512) 1574912
nal)
dense_14 (Dense) (None, 1024) 525312
dense_15 (Dense) (None, 75) 76875
activation (Activation) (None, 75) 0
=================================================================
Total params: 8,686,087
Trainable params: 8,686,087
Non-trainable params: 0
_________________________________________________________________
None
WARNING:tensorflow:`period` argument is deprecated. Please use `save_freq` to specify the frequency in number of batches seen.
WARNING:tensorflow:`period` argument is deprecated. Please use `save_freq` to specify the frequency in number of batches seen.
Epoch 1/10
54/54 [==============================] - ETA: 0s - loss: 2.6655 - accuracy: 0.4619
Epoch 1: val_loss improved from inf to 2.63906, saving model to Weights/LSTM_epoch01_loss2.6391.h5
54/54 [==============================] - 73s 1s/step - loss: 2.6655 - accuracy: 0.4619 - val_loss: 2.6391 - val_accuracy: 0.4476
Epoch 2/10
54/54 [==============================] - ETA: 0s - loss: 2.5471 - accuracy: 0.4728
Epoch 2: val_loss improved from 2.63906 to 2.61893, saving model to Weights/LSTM_epoch02_loss2.6189.h5
54/54 [==============================] - 60s 1s/step - loss: 2.5471 - accuracy: 0.4728 - val_loss: 2.6189 - val_accuracy: 0.4476
Epoch 3/10
54/54 [==============================] - ETA: 0s - loss: 2.4863 - accuracy: 0.4728
Epoch 3: val_loss improved from 2.61893 to 2.49154, saving model to Weights/LSTM_epoch03_loss2.4915.h5
54/54 [==============================] - 49s 898ms/step - loss: 2.4863 - accuracy: 0.4728 - val_loss: 2.4915 - val_accuracy: 0.4476
Epoch 4/10
54/54 [==============================] - ETA: 0s - loss: 2.2913 - accuracy: 0.5038
Epoch 4: val_loss improved from 2.49154 to 2.24010, saving model to Weights/LSTM_epoch04_loss2.2401.h5
54/54 [==============================] - 50s 926ms/step - loss: 2.2913 - accuracy: 0.5038 - val_loss: 2.2401 - val_accuracy: 0.5088
Epoch 5/10
54/54 [==============================] - ETA: 0s - loss: 2.0618 - accuracy: 0.5447
Epoch 5: val_loss improved from 2.24010 to 2.16629, saving model to Weights/LSTM_epoch05_loss2.1663.h5
54/54 [==============================] - 49s 909ms/step - loss: 2.0618 - accuracy: 0.5447 - val_loss: 2.1663 - val_accuracy: 0.5088
Epoch 6/10
54/54 [==============================] - ETA: 0s - loss: 1.9983 - accuracy: 0.5482
Epoch 6: val_loss improved from 2.16629 to 2.13873, saving model to Weights/LSTM_epoch06_loss2.1387.h5
54/54 [==============================] - 49s 904ms/step - loss: 1.9983 - accuracy: 0.5482 - val_loss: 2.1387 - val_accuracy: 0.5135
Epoch 7/10
54/54 [==============================] - ETA: 0s - loss: 1.9031 - accuracy: 0.5544
Epoch 7: val_loss improved from 2.13873 to 2.07586, saving model to Weights/LSTM_epoch07_loss2.0759.h5
54/54 [==============================] - 49s 902ms/step - loss: 1.9031 - accuracy: 0.5544 - val_loss: 2.0759 - val_accuracy: 0.5159
Epoch 8/10
54/54 [==============================] - ETA: 0s - loss: 1.8439 - accuracy: 0.5584
Epoch 8: val_loss improved from 2.07586 to 2.05908, saving model to Weights/LSTM_epoch08_loss2.0591.h5
54/54 [==============================] - 48s 889ms/step - loss: 1.8439 - accuracy: 0.5584 - val_loss: 2.0591 - val_accuracy: 0.5165
Epoch 9/10
54/54 [==============================] - ETA: 0s - loss: 1.7922 - accuracy: 0.5626
Epoch 9: val_loss did not improve from 2.05908
54/54 [==============================] - 48s 892ms/step - loss: 1.7922 - accuracy: 0.5626 - val_loss: 2.1126 - val_accuracy: 0.5018
Epoch 10/10
54/54 [==============================] - ETA: 0s - loss: 1.7141 - accuracy: 0.5731
Epoch 10: val_loss did not improve from 2.05908
54/54 [==============================] - 48s 885ms/step - loss: 1.7141 - accuracy: 0.5731 - val_loss: 2.1302 - val_accuracy: 0.5176
Estimator: <keras.engine.sequential.Sequential object at 0x7fb09cd9c510>
================================================================================
Training accuracy: 57.74%
Testing accuracy: 51.76%
================================================================================
Confusion matrix:
[[729 0 0 ... 0 2 0]
[ 0 0 0 ... 0 6 0]
[ 4 0 0 ... 0 12 0]
...
[ 1 0 0 ... 0 0 0]
[ 1 0 0 ... 0 115 0]
[ 11 0 0 ... 0 39 0]]
================================================================================
Classification report:
precision recall f1-score support
0 0.64 0.96 0.77 761
1 0.00 0.00 0.00 8
2 0.00 0.00 0.00 24
3 0.00 0.00 0.00 5
4 0.07 0.29 0.12 42
5 0.13 0.08 0.10 26
6 0.00 0.00 0.00 20
7 0.00 0.00 0.00 8
8 0.00 0.00 0.00 20
9 0.00 0.00 0.00 17
10 0.00 0.00 0.00 18
11 0.00 0.00 0.00 58
12 0.15 0.10 0.12 51
13 0.00 0.00 0.00 5
14 0.00 0.00 0.00 7
15 0.00 0.00 0.00 5
16 0.00 0.00 0.00 3
17 0.53 0.24 0.33 72
18 0.00 0.00 0.00 20
19 0.00 0.00 0.00 13
20 0.00 0.00 0.00 2
21 0.00 0.00 0.00 8
22 0.00 0.00 0.00 20
23 0.00 0.00 0.00 42
24 0.00 0.00 0.00 6
25 0.00 0.00 0.00 24
27 0.00 0.00 0.00 14
28 0.00 0.00 0.00 12
29 0.00 0.00 0.00 1
30 0.00 0.00 0.00 2
31 0.00 0.00 0.00 2
32 0.00 0.00 0.00 2
33 0.00 0.00 0.00 6
34 0.00 0.00 0.00 26
35 0.00 0.00 0.00 8
36 0.00 0.00 0.00 10
37 0.00 0.00 0.00 8
39 0.00 0.00 0.00 1
40 0.00 0.00 0.00 12
41 0.00 0.00 0.00 2
42 0.00 0.00 0.00 9
43 0.00 0.00 0.00 6
45 0.00 0.00 0.00 28
46 0.00 0.00 0.00 2
47 0.00 0.00 0.00 1
48 0.00 0.00 0.00 1
49 0.00 0.00 0.00 3
51 0.00 0.00 0.00 1
56 0.00 0.00 0.00 46
57 0.00 0.00 0.00 5
59 0.00 0.00 0.00 5
60 0.00 0.00 0.00 1
62 0.00 0.00 0.00 2
63 0.00 0.00 0.00 1
64 0.00 0.00 0.00 1
66 0.00 0.00 0.00 1
67 0.00 0.00 0.00 18
70 0.00 0.00 0.00 1
72 0.36 0.95 0.53 121
73 0.00 0.00 0.00 56
accuracy 0.52 1700
macro avg 0.03 0.04 0.03 1700
weighted avg 0.34 0.52 0.40 1700
Time taken: 599.168709s to run the model
pickle.dump(model_LTSM, open('LTSMmodel.pkl','wb'))
INFO:tensorflow:Assets written to: ram://9239cba5-c4e9-48f1-918b-083fa0c0ced7/assets
INFO:tensorflow:Assets written to: ram://9239cba5-c4e9-48f1-918b-083fa0c0ced7/assets WARNING:absl:<keras.layers.recurrent.LSTMCell object at 0x7fb28f0f7c90> has the same name 'LSTMCell' as a built-in Keras object. Consider renaming <class 'keras.layers.recurrent.LSTMCell'> to avoid naming conflicts when loading with `tf.keras.models.load_model`. If renaming is not possible, pass the object in the `custom_objects` parameter of the load function. WARNING:absl:<keras.layers.recurrent.LSTMCell object at 0x7fb0a25e6c50> has the same name 'LSTMCell' as a built-in Keras object. Consider renaming <class 'keras.layers.recurrent.LSTMCell'> to avoid naming conflicts when loading with `tf.keras.models.load_model`. If renaming is not possible, pass the object in the `custom_objects` parameter of the load function. WARNING:absl:<keras.layers.recurrent.LSTMCell object at 0x7fb28f126350> has the same name 'LSTMCell' as a built-in Keras object. Consider renaming <class 'keras.layers.recurrent.LSTMCell'> to avoid naming conflicts when loading with `tf.keras.models.load_model`. If renaming is not possible, pass the object in the `custom_objects` parameter of the load function. WARNING:absl:<keras.layers.recurrent.LSTMCell object at 0x7fb0a2692390> has the same name 'LSTMCell' as a built-in Keras object. Consider renaming <class 'keras.layers.recurrent.LSTMCell'> to avoid naming conflicts when loading with `tf.keras.models.load_model`. If renaming is not possible, pass the object in the `custom_objects` parameter of the load function. WARNING:absl:<keras.layers.recurrent.LSTMCell object at 0x7fb2187078d0> has the same name 'LSTMCell' as a built-in Keras object. Consider renaming <class 'keras.layers.recurrent.LSTMCell'> to avoid naming conflicts when loading with `tf.keras.models.load_model`. If renaming is not possible, pass the object in the `custom_objects` parameter of the load function. WARNING:absl:<keras.layers.recurrent.LSTMCell object at 0x7fb2f5246810> has the same name 'LSTMCell' as a built-in Keras object. Consider renaming <class 'keras.layers.recurrent.LSTMCell'> to avoid naming conflicts when loading with `tf.keras.models.load_model`. If renaming is not possible, pass the object in the `custom_objects` parameter of the load function. WARNING:absl:<keras.layers.recurrent.LSTMCell object at 0x7fb2ec690910> has the same name 'LSTMCell' as a built-in Keras object. Consider renaming <class 'keras.layers.recurrent.LSTMCell'> to avoid naming conflicts when loading with `tf.keras.models.load_model`. If renaming is not possible, pass the object in the `custom_objects` parameter of the load function. WARNING:absl:<keras.layers.recurrent.LSTMCell object at 0x7fb2ec6aa2d0> has the same name 'LSTMCell' as a built-in Keras object. Consider renaming <class 'keras.layers.recurrent.LSTMCell'> to avoid naming conflicts when loading with `tf.keras.models.load_model`. If renaming is not possible, pass the object in the `custom_objects` parameter of the load function.